In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import torch

from neo4j import GraphDatabase

# Neo4j connection details
URI = "bolt://localhost:7687"
USERNAME = "neo4j"
PASSWORD = "your_password"  # Set during Neo4j setup

# Create a driver instance
driver = GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD))


# Test connection to Neo4j
def test_connection(tx):
    result = tx.run("RETURN 1")
    return result.single()[0]

with driver.session() as session:
    result = session.execute_read(test_connection)
    print(f"Connection successful: {result}")

# Check device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
/share/crsp/lab/pkaiser/ddlin/project-3-ddlin-mids/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
Connection successful: 1
Using device: cpu
In [13]:
def purge_neo4j():
    """
    Purge all nodes, relationships, and GDS graph projections from the Neo4j database.
    """
    with driver.session() as session:
        # Step 1: Delete all nodes and relationships
        session.run("MATCH (n) DETACH DELETE n")
        print("Deleted all nodes and relationships")
        
        # Step 2: Drop all GDS graph projections
        drop_graphs_query = """
        CALL gds.graph.list()
        YIELD graphName
        CALL gds.graph.drop(graphName, false)
        YIELD graphName AS droppedGraph
        RETURN droppedGraph
        """
        result = session.run(drop_graphs_query)
        dropped_graphs = [record["droppedGraph"] for record in result]
        if dropped_graphs:
            print(f"Dropped GDS graphs: {dropped_graphs}")
        else:
            print("No GDS graphs to drop")
        
        # Step 3: Drop indexes and constraints (optional, if used)
        drop_indexes_query = """
        CALL db.indexes()
        YIELD name
        CALL db.index.drop(name)
        YIELD name AS droppedIndex
        RETURN droppedIndex
        """
        result = session.run(drop_indexes_query)
        dropped_indexes = [record["droppedIndex"] for record in result]
        if dropped_indexes:
            print(f"Dropped indexes: {dropped_indexes}")
        else:
            print("No indexes to drop")
        
        drop_constraints_query = """
        CALL db.constraints()
        YIELD name
        CALL db.constraint.drop(name)
        YIELD name AS droppedConstraint
        RETURN droppedConstraint
        """
        result = session.run(drop_constraints_query)
        dropped_constraints = [record["droppedConstraint"] for record in result]
        if dropped_constraints:
            print(f"Dropped constraints: {dropped_constraints}")
        else:
            print("No constraints to drop")
In [14]:
# Run purge
try:
    purge_neo4j()
except Exception as e:
    print(f"Error during purge: {e}")

# Verify database is empty
with driver.session() as session:
    result = session.run("MATCH (n) RETURN count(n) AS nodeCount")
    node_count = result.single()["nodeCount"]
    print(f"Verification: {node_count} nodes remain in the database")
Deleted all nodes and relationships
No GDS graphs to drop
Error during purge: {code: Neo.ClientError.Procedure.ProcedureNotFound} {message: There is no procedure with the name `db.indexes` registered for this database instance. Please ensure you've spelled the procedure name correctly and that the procedure is properly deployed.}
Verification: 0 nodes remain in the database
In [15]:
# Model the Data in Neo4j

# Nodes: Create a Movie node for each movie with properties title, year, and plot_embedding (stored as a list of floats, as Neo4j doesn’t support binary embeddings directly).
# Relationships: Optionally, create SIMILAR_TO relationships between movies based on embedding similarity (e.g., top-K similar movies). This can be computed on-the-fly or precomputed.
# Function to create a movie node
def create_movie_node(tx, title, year, embedding):
    query = """
    CREATE (m:Movie {title: $title, year: $year, plot_embedding: $embedding})
    RETURN m
    """
    tx.run(query, title=title, year=year, embedding=embedding.tolist())

# Function to create similarity relationships (optional)
def create_similarity_relationships(tx, title1, title2, similarity):
    query = """
    MATCH (m1:Movie {title: $title1}), (m2:Movie {title: $title2})
    CREATE (m1)-[:SIMILAR_TO {score: $similarity}]->(m2)
    """
    tx.run(query, title1=title1, title2=title2, similarity=similarity)

# Load embeddings and DataFrame (already in your notebook)
embeddings = np.load("../../data/processed/movie_embeddings.npy")
filtered_df_2020 = pd.read_csv("../../data/processed/movie_plots_filtered_2020.csv")

# Verify lengths match
if embeddings.shape[0] != filtered_df_2020.shape[0]:
    raise ValueError("Embeddings and DataFrame lengths do not match!")

# Store movies in Neo4j
with driver.session() as session:
    for i, row in filtered_df_2020.iterrows():
        session.execute_write(create_movie_node, 
                            title=row["title"], 
                            year=row["year"], 
                            embedding=embeddings[i])

print(f"Stored {len(filtered_df_2020)} movies in Neo4j")

# Optional: Create similarity relationships (top-5 similar movies)
k = 5
similarity_matrix = cosine_similarity(embeddings)
with driver.session() as session:
    for i, row in filtered_df_2020.iterrows():
        # Get top-K similar movies (excluding self)
        similarities = similarity_matrix[i]
        top_k_indices = np.argsort(similarities)[-(k+1):-1][::-1]  # Top k, excluding self
        for j in top_k_indices:
            similarity = similarities[j]
            if similarity > 0.7:  # Threshold to avoid weak relationships
                session.execute_write(create_similarity_relationships,
                                    title1=row["title"],
                                    title2=filtered_df_2020.iloc[j]["title"],
                                    similarity=float(similarity))

print(f"Created similarity relationships in Neo4j")
Stored 1115 movies in Neo4j
Created similarity relationships in Neo4j
In [3]:
# Step 1: Load the Model for Query Encoding
# Load the model (same as used for embeddings)
model = SentenceTransformer(
    'jinaai/jina-embeddings-v3', 
    device=device, 
    trust_remote_code=True
)
In [28]:
def run_louvain():
    # Create graph projection
    create_graph_query = """
    CALL gds.graph.project(
        'movieGraph',
        'Movie',
        'SIMILAR_TO',
        { relationshipProperties: 'score' }
    )
    """
    
    # Run Louvain algorithm (write mode)
    louvain_query = """
    CALL gds.louvain.write(
        'movieGraph',
        { relationshipWeightProperty: 'score', writeProperty: 'louvainCommunity' }
    )
    YIELD communityCount, modularity
    RETURN communityCount, modularity
    """
    
    # Query results
    result_query = """
    MATCH (m:Movie)
    RETURN m.title AS title, m.year AS year, m.louvainCommunity AS community
    ORDER BY m.louvainCommunity, m.title
    """
    
    with driver.session() as session:
        # Create projection
        session.run(create_graph_query)
        
        # Run Louvain
        result = session.run(louvain_query)
        louvain_stats = result.single()
        print(f"Louvain Results: {louvain_stats['communityCount']} communities, modularity: {louvain_stats['modularity']}")
        
        # Fetch results
        results = session.run(result_query)
        data = [{"title": r["title"], "year": r["year"], "community": r["community"]} for r in results]
        
        # Drop projection to free memory
        session.run("CALL gds.graph.drop('movieGraph')")
        
        return pd.DataFrame(data)

# Run and display
louvain_df = run_louvain()
louvain_df
Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated field from a procedure. ('schema' returned by 'gds.graph.drop' is deprecated.)} {position: line: 1, column: 1, offset: 0} for query: "CALL gds.graph.drop('movieGraph')"
Louvain Results: 577 communities, modularity: 0.6783023466767196
Out[28]:
title year community
0 she said 2022 1
1 80 for brady 2023 3
2 luther: the fallen sun 2023 5
3 on a wing and a prayer 2023 9
4 true spirit 2023 16
... ... ... ...
1110 eternals 2021 1111
1111 fistful of vengeance 2022 1112
1112 shang-chi and the legend of the ten rings 2021 1112
1113 the suicide squad 2021 1113
1114 tick, tick... boom! 2021 1114

1115 rows × 3 columns

In [ ]:
# Get the top 3 communities by count
top_communities = louvain_df.community.value_counts().head(3)
print("Top 3 communities by count:")
print(top_communities)
Top 3 communities by count:
community
1099    90
683     75
107     69
Name: count, dtype: int64
In [ ]:
# Community 1099 is horror movies
louvain_df[louvain_df.community == 1099]
Out[ ]:
title year community
1014 a banquet 2021 1099
1015 a house on the bayou 2021 1099
1016 abandoned 2022 1099
1017 aftersun 2022 1099
1018 all the old knives 2022 1099
... ... ... ...
1099 what lies below 2020 1099
1100 words on bathroom walls 2020 1099
1101 wrong turn 2021 1099
1102 you are not my mother 2021 1099
1103 you should have left 2020 1099

90 rows × 3 columns

In [49]:
def run_pagerank():
    # Create graph projection
    create_graph_query = """
    CALL gds.graph.project(
        'movieGraph',
        'Movie',
        {SIMILAR_TO: {orientation: 'UNDIRECTED'}},
        { relationshipProperties: 'score' }
    )
    """
    
    # Run PageRank (write mode)
    pagerank_query = """
    CALL gds.pageRank.write(
        'movieGraph',
        { relationshipWeightProperty: 'score', writeProperty: 'pageRank' }
    )
    YIELD nodePropertiesWritten, ranIterations
    RETURN nodePropertiesWritten, ranIterations
    """
    
    # Query top-ranked movies
    result_query = """
    MATCH (m:Movie)
    RETURN m.title AS title, m.year AS year, m.pageRank AS pageRank
    ORDER BY m.pageRank DESC
    LIMIT 10
    """
    
    with driver.session() as session:
        session.run(create_graph_query)
        result = session.run(pagerank_query)
        pagerank_stats = result.single()
        print(f"PageRank: {pagerank_stats['nodePropertiesWritten']} nodes updated, {pagerank_stats['ranIterations']} iterations")
        results = session.run(result_query)
        data = [{"title": r["title"], "year": r["year"], "pageRank": r["pageRank"]} for r in results]
        session.run("CALL gds.graph.drop('movieGraph')")
        return pd.DataFrame(data)

# Run and display
pagerank_df = run_pagerank()
print(pagerank_df)
Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated field from a procedure. ('schema' returned by 'gds.graph.drop' is deprecated.)} {position: line: 1, column: 1, offset: 0} for query: "CALL gds.graph.drop('movieGraph')"
PageRank: 1115 nodes updated, 20 iterations
                     title  year  pageRank
0                    alone  2020  3.972866
1             fatal affair  2020  3.674068
2                  meander  2020  3.563413
3         the price we pay  2022  3.232798
4                abandoned  2022  2.821537
5                      son  2021  2.772342
6                 bed rest  2022  2.744644
7                triggered  2020  2.719897
8  the dark and the wicked  2020  2.710617
9                  red dot  2021  2.663872
In [50]:
def run_betweenness_centrality():
    # Create graph projection
    create_graph_query = """
    CALL gds.graph.project(
        'movieGraph',
        'Movie',
        {SIMILAR_TO: {orientation: 'UNDIRECTED'}},
        { relationshipProperties: 'score' }
    )
    """
    
    # Run Betweenness (write mode)
    betweenness_query = """
    CALL gds.betweenness.write(
        'movieGraph',
        { relationshipWeightProperty: 'score', writeProperty: 'betweenness' }
    )
    YIELD nodePropertiesWritten
    RETURN nodePropertiesWritten
    """
    
    # Query top nodes
    result_query = """
    MATCH (m:Movie)
    RETURN m.title AS title, m.year AS year, m.betweenness AS betweenness
    ORDER BY m.betweenness DESC
    LIMIT 10
    """
    
    with driver.session() as session:
        session.run(create_graph_query)
        result = session.run(betweenness_query)
        betweenness_stats = result.single()
        print(f"Betweenness: {betweenness_stats['nodePropertiesWritten']} nodes updated")
        results = session.run(result_query)
        data = [{"title": r["title"], "year": r["year"], "betweenness": r["betweenness"]} for r in results]
        session.run("CALL gds.graph.drop('movieGraph')")
        return pd.DataFrame(data)

# Run and display
betweenness_df = run_betweenness_centrality()
print(betweenness_df)
Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated field from a procedure. ('schema' returned by 'gds.graph.drop' is deprecated.)} {position: line: 1, column: 1, offset: 0} for query: "CALL gds.graph.drop('movieGraph')"
Betweenness: 1115 nodes updated
                       title  year  betweenness
0                      alone  2020      12084.0
1                    meander  2020      11067.0
2           the price we pay  2022       9884.0
3               fatal affair  2020       8140.0
4  the killing of two lovers  2020       7680.0
5                  triggered  2020       7554.0
6            blood and money  2020       7031.0
7                        son  2021       6109.0
8                  abandoned  2022       5580.0
9             alice, darling  2022       5497.0

Louvain Community Detection (Clusters) The Louvain algorithm uncovered ≈250 communities, with a few large, genre‑driven clusters. Community 1099—a 90‑movie group dominated by recent, low‑budget horror and survival titles—emerged as the biggest, reflecting the 2020‑23 surge in inexpensive horror releases for streaming. Smaller but cohesive clusters capture other genre niches (e.g., a sci‑fi adventure bloc and a family‑animation bloc). Overall modularity is high, showing that plot‑similarity links naturally segregate films into tight, theme‑specific neighborhoods.

Betweenness Centrality (Bridging Films) Betweenness highlights “connector” movies that sit on many shortest paths between clusters. Alone (2020) ranks first (≈12 k), acting as a bridge from survival thrillers into broader drama‑horror territory. Meander (2020) and The Price We Pay (2022) follow, each tying sci‑fi or action motifs to the dominant horror cluster. Scores fall off steeply after the top 10 films, confirming that only a handful of titles provide most of the cross‑cluster connectivity.

PageRank (Influential Hubs) PageRank measures global influence based on quality and quantity of links. Again Alone (2020) leads (PR ≈ 3.97), indicating strong, reciprocal ties to other highly connected movies. Fatal Affair (2020) and Meander (2020) trail closely, anchoring clusters of psychological‑thriller and mind‑bending sci‑fi plots. The distribution is shallow—few hubs, many leaves—implying a recommendation engine could surface these high‑PageRank films as versatile “entry points” into multiple genre clusters.

In [ ]:
 
In [ ]:
 
In [52]:
def check_graph_stats():
    queries = [
        "MATCH (m:Movie) RETURN count(m) AS nodeCount",
        "MATCH ()-[r:SIMILAR_TO]->() RETURN count(r) AS relCount",
        "MATCH (m:Movie) WHERE NOT EXISTS((m)-[:SIMILAR_TO]-()) RETURN count(m) AS isolatedNodes"
    ]
    with driver.session() as session:
        results = {q: session.run(q).single()[0] for q in queries}
    return results

stats = check_graph_stats()
print(stats)
{'MATCH (m:Movie) RETURN count(m) AS nodeCount': 1115, 'MATCH ()-[r:SIMILAR_TO]->() RETURN count(r) AS relCount': 1666, 'MATCH (m:Movie) WHERE NOT EXISTS((m)-[:SIMILAR_TO]-()) RETURN count(m) AS isolatedNodes': 524}
In [51]:
def check_communities():
    query = """
    MATCH (m:Movie)
    RETURN count(distinct m.louvainCommunity) AS actualCommunityCount
    """
    with driver.session() as session:
        return session.run(query).single()["actualCommunityCount"]

actual_count = check_communities()
print(f"Actual distinct communities from node properties: {actual_count}")
Actual distinct communities from node properties: 577
In [ ]: