import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import torch
from neo4j import GraphDatabase
# Neo4j connection details
URI = "bolt://localhost:7687"  # default Bolt port of a local Neo4j instance
USERNAME = "neo4j"
PASSWORD = "your_password" # Set during Neo4j setup
# Create a driver instance. The driver is lazy: no connection is attempted
# until a session actually runs a query (see the connection test below).
driver = GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD))
# Test connection to Neo4j
def test_connection(tx):
result = tx.run("RETURN 1")
return result.single()[0]
# Verify the driver can actually reach the database before doing any real work.
with driver.session() as session:
    result = session.execute_read(test_connection)
    print(f"Connection successful: {result}")
# Check device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
/share/crsp/lab/pkaiser/ddlin/project-3-ddlin-mids/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
Connection successful: 1 Using device: cpu
def purge_neo4j():
    """
    Purge all nodes, relationships, GDS graph projections, constraints, and
    indexes from the Neo4j database.

    Fix: the legacy ``db.indexes()`` / ``db.constraints()`` procedures were
    removed in Neo4j 5 (the original code failed with ProcedureNotFound);
    Neo4j 5 uses the ``SHOW INDEXES`` / ``SHOW CONSTRAINTS`` commands instead.
    Constraints are dropped before indexes because a constraint-backed index
    cannot be dropped directly.
    """
    with driver.session() as session:
        # Step 1: Delete all nodes and relationships
        session.run("MATCH (n) DETACH DELETE n")
        print("Deleted all nodes and relationships")

        # Step 2: Drop all GDS graph projections (failIfMissing = false)
        drop_graphs_query = """
        CALL gds.graph.list()
        YIELD graphName
        CALL gds.graph.drop(graphName, false)
        YIELD graphName AS droppedGraph
        RETURN droppedGraph
        """
        result = session.run(drop_graphs_query)
        dropped_graphs = [record["droppedGraph"] for record in result]
        if dropped_graphs:
            print(f"Dropped GDS graphs: {dropped_graphs}")
        else:
            print("No GDS graphs to drop")

        # Step 3: Drop constraints. Names cannot be parameterized in DDL,
        # so interpolate the backtick-quoted name returned by SHOW CONSTRAINTS.
        result = session.run("SHOW CONSTRAINTS YIELD name RETURN name")
        dropped_constraints = [record["name"] for record in result]
        for name in dropped_constraints:
            session.run(f"DROP CONSTRAINT `{name}` IF EXISTS")
        if dropped_constraints:
            print(f"Dropped constraints: {dropped_constraints}")
        else:
            print("No constraints to drop")

        # Step 4: Drop the remaining (non-constraint-backed) indexes.
        result = session.run("SHOW INDEXES YIELD name RETURN name")
        dropped_indexes = [record["name"] for record in result]
        for name in dropped_indexes:
            session.run(f"DROP INDEX `{name}` IF EXISTS")
        if dropped_indexes:
            print(f"Dropped indexes: {dropped_indexes}")
        else:
            print("No indexes to drop")
# Run purge
try:
    purge_neo4j()
except Exception as e:
    # Best-effort cleanup: report the failure but keep the notebook running.
    print(f"Error during purge: {e}")
# Verify database is empty (node count should be 0 after a successful purge)
with driver.session() as session:
    result = session.run("MATCH (n) RETURN count(n) AS nodeCount")
    node_count = result.single()["nodeCount"]
    print(f"Verification: {node_count} nodes remain in the database")
Deleted all nodes and relationships
No GDS graphs to drop
Error during purge: {code: Neo.ClientError.Procedure.ProcedureNotFound} {message: There is no procedure with the name `db.indexes` registered for this database instance. Please ensure you've spelled the procedure name correctly and that the procedure is properly deployed.}
Verification: 0 nodes remain in the database
# Model the Data in Neo4j
# Nodes: Create a Movie node for each movie with properties title, year, and plot_embedding (stored as a list of floats, as Neo4j doesn’t support binary embeddings directly).
# Relationships: Optionally, create SIMILAR_TO relationships between movies based on embedding similarity (e.g., top-K similar movies). This can be computed on-the-fly or precomputed.
# Function to create a movie node
def create_movie_node(tx, title, year, embedding):
    """Create one :Movie node with its title, year, and plot embedding.

    The numpy embedding vector is converted to a plain list of floats,
    since Neo4j properties cannot hold numpy arrays.
    """
    cypher = """
    CREATE (m:Movie {title: $title, year: $year, plot_embedding: $embedding})
    RETURN m
    """
    tx.run(cypher, title=title, year=year, embedding=embedding.tolist())
# Function to create similarity relationships (optional)
def create_similarity_relationships(tx, title1, title2, similarity):
    """Create a weighted SIMILAR_TO edge between two existing Movie nodes,
    matched by title, storing the cosine similarity as the `score` property."""
    cypher = """
    MATCH (m1:Movie {title: $title1}), (m2:Movie {title: $title2})
    CREATE (m1)-[:SIMILAR_TO {score: $similarity}]->(m2)
    """
    tx.run(cypher, title1=title1, title2=title2, similarity=similarity)
# Load embeddings and DataFrame (already in your notebook)
embeddings = np.load("../../data/processed/movie_embeddings.npy")
filtered_df_2020 = pd.read_csv("../../data/processed/movie_plots_filtered_2020.csv")
# Verify lengths match: row i of the DataFrame must align with embedding row i.
if embeddings.shape[0] != filtered_df_2020.shape[0]:
    raise ValueError("Embeddings and DataFrame lengths do not match!")
# Store movies in Neo4j, one write transaction per movie.
# NOTE(review): `i` is the DataFrame index, used to index embeddings[i]; this
# aligns only because read_csv yields a fresh 0..n-1 RangeIndex — confirm if
# the DataFrame is ever filtered or reindexed before this cell.
with driver.session() as session:
    for i, row in filtered_df_2020.iterrows():
        session.execute_write(create_movie_node,
                              title=row["title"],
                              year=row["year"],
                              embedding=embeddings[i])
print(f"Stored {len(filtered_df_2020)} movies in Neo4j")
# Optional: Create similarity relationships (top-5 similar movies)
k = 5
similarity_matrix = cosine_similarity(embeddings)  # (n, n) pairwise cosine similarities
with driver.session() as session:
    for i, row in filtered_df_2020.iterrows():
        # Get top-K similar movies (excluding self)
        similarities = similarity_matrix[i]
        # argsort is ascending, so the last entry is assumed to be the
        # self-similarity (1.0); take the k entries just before it, reversed
        # to descending order.
        # NOTE(review): this assumes no *other* pair reaches similarity 1.0
        # (e.g. duplicate plots) — otherwise the wrong index gets excluded.
        top_k_indices = np.argsort(similarities)[-(k+1):-1][::-1] # Top k, excluding self
        for j in top_k_indices:
            similarity = similarities[j]
            if similarity > 0.7: # Threshold to avoid weak relationships
                session.execute_write(create_similarity_relationships,
                                      title1=row["title"],
                                      title2=filtered_df_2020.iloc[j]["title"],
                                      similarity=float(similarity))
print(f"Created similarity relationships in Neo4j")
Stored 1115 movies in Neo4j Created similarity relationships in Neo4j
# Step 1: Load the Model for Query Encoding
# Load the model (same as used for embeddings, so query vectors live in the
# same embedding space as the stored plot embeddings).
# NOTE(review): trust_remote_code=True executes code shipped in the model
# repository — acceptable only for repos you trust, such as this one.
model = SentenceTransformer(
    'jinaai/jina-embeddings-v3',
    device=device,
    trust_remote_code=True
)
def run_louvain():
    """
    Run Louvain community detection over the SIMILAR_TO graph and return a
    DataFrame with columns (title, year, community).

    Fix: the GDS projection is now dropped in a ``finally`` block. Previously,
    if the Louvain call raised, the 'movieGraph' projection was never dropped,
    and every subsequent gds.graph.project('movieGraph', ...) call would fail
    with "graph already exists" until the projection was removed by hand.
    """
    # Create an in-memory graph projection of Movie nodes and SIMILAR_TO edges.
    create_graph_query = """
    CALL gds.graph.project(
        'movieGraph',
        'Movie',
        'SIMILAR_TO',
        { relationshipProperties: 'score' }
    )
    """
    # Run Louvain in write mode: communities are persisted on each node
    # as the `louvainCommunity` property.
    louvain_query = """
    CALL gds.louvain.write(
        'movieGraph',
        { relationshipWeightProperty: 'score', writeProperty: 'louvainCommunity' }
    )
    YIELD communityCount, modularity
    RETURN communityCount, modularity
    """
    # Read back the per-movie community assignments.
    result_query = """
    MATCH (m:Movie)
    RETURN m.title AS title, m.year AS year, m.louvainCommunity AS community
    ORDER BY m.louvainCommunity, m.title
    """
    with driver.session() as session:
        # Create projection
        session.run(create_graph_query)
        try:
            # Run Louvain
            result = session.run(louvain_query)
            louvain_stats = result.single()
            print(f"Louvain Results: {louvain_stats['communityCount']} communities, modularity: {louvain_stats['modularity']}")
            # Fetch results
            results = session.run(result_query)
            data = [{"title": r["title"], "year": r["year"], "community": r["community"]} for r in results]
        finally:
            # Always free the in-memory projection, even if Louvain failed.
            session.run("CALL gds.graph.drop('movieGraph')")
    return pd.DataFrame(data)
# Run and display
louvain_df = run_louvain()
# Bare expression: the notebook cell renders the DataFrame below.
louvain_df
Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated field from a procedure. ('schema' returned by 'gds.graph.drop' is deprecated.)} {position: line: 1, column: 1, offset: 0} for query: "CALL gds.graph.drop('movieGraph')"
Louvain Results: 577 communities, modularity: 0.6783023466767196
| title | year | community | |
|---|---|---|---|
| 0 | she said | 2022 | 1 |
| 1 | 80 for brady | 2023 | 3 |
| 2 | luther: the fallen sun | 2023 | 5 |
| 3 | on a wing and a prayer | 2023 | 9 |
| 4 | true spirit | 2023 | 16 |
| ... | ... | ... | ... |
| 1110 | eternals | 2021 | 1111 |
| 1111 | fistful of vengeance | 2022 | 1112 |
| 1112 | shang-chi and the legend of the ten rings | 2021 | 1112 |
| 1113 | the suicide squad | 2021 | 1113 |
| 1114 | tick, tick... boom! | 2021 | 1114 |
1115 rows × 3 columns
# Get the top 3 communities by count (largest clusters of similar movies)
top_communities = louvain_df.community.value_counts().head(3)
print("Top 3 communities by count:")
print(top_communities)
Top 3 communities by count: community 1099 90 683 75 107 69 Name: count, dtype: int64
# Community 1099 is horror movies
# Inspect the members of the largest community (90 titles in this run).
louvain_df[louvain_df.community == 1099]
| title | year | community | |
|---|---|---|---|
| 1014 | a banquet | 2021 | 1099 |
| 1015 | a house on the bayou | 2021 | 1099 |
| 1016 | abandoned | 2022 | 1099 |
| 1017 | aftersun | 2022 | 1099 |
| 1018 | all the old knives | 2022 | 1099 |
| ... | ... | ... | ... |
| 1099 | what lies below | 2020 | 1099 |
| 1100 | words on bathroom walls | 2020 | 1099 |
| 1101 | wrong turn | 2021 | 1099 |
| 1102 | you are not my mother | 2021 | 1099 |
| 1103 | you should have left | 2020 | 1099 |
90 rows × 3 columns
def run_pagerank():
    """
    Run weighted PageRank over the (undirected) SIMILAR_TO graph and return
    the 10 highest-ranked movies as a DataFrame (title, year, pageRank).

    Fix: the GDS projection is dropped in a ``finally`` block so a failed
    PageRank call cannot leave a stale 'movieGraph' projection behind, which
    would make the next gds.graph.project call fail with "graph already exists".
    """
    # Project the graph as UNDIRECTED so rank flows both ways along an edge.
    create_graph_query = """
    CALL gds.graph.project(
        'movieGraph',
        'Movie',
        {SIMILAR_TO: {orientation: 'UNDIRECTED'}},
        { relationshipProperties: 'score' }
    )
    """
    # Run PageRank in write mode: scores persisted as the `pageRank` property.
    pagerank_query = """
    CALL gds.pageRank.write(
        'movieGraph',
        { relationshipWeightProperty: 'score', writeProperty: 'pageRank' }
    )
    YIELD nodePropertiesWritten, ranIterations
    RETURN nodePropertiesWritten, ranIterations
    """
    # Query top-ranked movies
    result_query = """
    MATCH (m:Movie)
    RETURN m.title AS title, m.year AS year, m.pageRank AS pageRank
    ORDER BY m.pageRank DESC
    LIMIT 10
    """
    with driver.session() as session:
        session.run(create_graph_query)
        try:
            result = session.run(pagerank_query)
            pagerank_stats = result.single()
            print(f"PageRank: {pagerank_stats['nodePropertiesWritten']} nodes updated, {pagerank_stats['ranIterations']} iterations")
            results = session.run(result_query)
            data = [{"title": r["title"], "year": r["year"], "pageRank": r["pageRank"]} for r in results]
        finally:
            # Always free the in-memory projection, even on failure.
            session.run("CALL gds.graph.drop('movieGraph')")
    return pd.DataFrame(data)
# Run and display the 10 most influential movies by PageRank.
pagerank_df = run_pagerank()
print(pagerank_df)
Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated field from a procedure. ('schema' returned by 'gds.graph.drop' is deprecated.)} {position: line: 1, column: 1, offset: 0} for query: "CALL gds.graph.drop('movieGraph')"
PageRank: 1115 nodes updated, 20 iterations
title year pageRank
0 alone 2020 3.972866
1 fatal affair 2020 3.674068
2 meander 2020 3.563413
3 the price we pay 2022 3.232798
4 abandoned 2022 2.821537
5 son 2021 2.772342
6 bed rest 2022 2.744644
7 triggered 2020 2.719897
8 the dark and the wicked 2020 2.710617
9 red dot 2021 2.663872
def run_betweenness_centrality():
    """
    Run weighted betweenness centrality over the (undirected) SIMILAR_TO graph
    and return the top 10 "bridge" movies as a DataFrame (title, year,
    betweenness).

    Fix: the GDS projection is dropped in a ``finally`` block so a failed
    algorithm call cannot leave a stale 'movieGraph' projection behind, which
    would make the next gds.graph.project call fail with "graph already exists".
    """
    # Project the graph as UNDIRECTED so shortest paths ignore edge direction.
    create_graph_query = """
    CALL gds.graph.project(
        'movieGraph',
        'Movie',
        {SIMILAR_TO: {orientation: 'UNDIRECTED'}},
        { relationshipProperties: 'score' }
    )
    """
    # Run Betweenness in write mode: scores persisted as `betweenness`.
    betweenness_query = """
    CALL gds.betweenness.write(
        'movieGraph',
        { relationshipWeightProperty: 'score', writeProperty: 'betweenness' }
    )
    YIELD nodePropertiesWritten
    RETURN nodePropertiesWritten
    """
    # Query top nodes
    result_query = """
    MATCH (m:Movie)
    RETURN m.title AS title, m.year AS year, m.betweenness AS betweenness
    ORDER BY m.betweenness DESC
    LIMIT 10
    """
    with driver.session() as session:
        session.run(create_graph_query)
        try:
            result = session.run(betweenness_query)
            betweenness_stats = result.single()
            print(f"Betweenness: {betweenness_stats['nodePropertiesWritten']} nodes updated")
            results = session.run(result_query)
            data = [{"title": r["title"], "year": r["year"], "betweenness": r["betweenness"]} for r in results]
        finally:
            # Always free the in-memory projection, even on failure.
            session.run("CALL gds.graph.drop('movieGraph')")
    return pd.DataFrame(data)
# Run and display the 10 strongest "bridge" movies by betweenness centrality.
betweenness_df = run_betweenness_centrality()
print(betweenness_df)
Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated field from a procedure. ('schema' returned by 'gds.graph.drop' is deprecated.)} {position: line: 1, column: 1, offset: 0} for query: "CALL gds.graph.drop('movieGraph')"
Betweenness: 1115 nodes updated
title year betweenness
0 alone 2020 12084.0
1 meander 2020 11067.0
2 the price we pay 2022 9884.0
3 fatal affair 2020 8140.0
4 the killing of two lovers 2020 7680.0
5 triggered 2020 7554.0
6 blood and money 2020 7031.0
7 son 2021 6109.0
8 abandoned 2022 5580.0
9 alice, darling 2022 5497.0
Louvain Community Detection (Clusters) The Louvain algorithm uncovered 577 communities, with a few large, genre‑driven clusters. Community 1099—a 90‑movie group dominated by recent, low‑budget horror and survival titles—emerged as the biggest, reflecting the 2020‑23 surge in inexpensive horror releases for streaming. Smaller but cohesive clusters capture other genre niches (e.g., a sci‑fi adventure bloc and a family‑animation bloc). Overall modularity is fairly high (≈0.68), showing that plot‑similarity links naturally segregate films into tight, theme‑specific neighborhoods.
Betweenness Centrality (Bridging Films) Betweenness highlights “connector” movies that sit on many shortest paths between clusters. Alone (2020) ranks first (≈12 k), acting as a bridge from survival thrillers into broader drama‑horror territory. Meander (2020) and The Price We Pay (2022) follow, each tying sci‑fi or action motifs to the dominant horror cluster. Scores fall off steeply after the top 10 films, confirming that only a handful of titles provide most of the cross‑cluster connectivity.
PageRank (Influential Hubs) PageRank measures global influence based on quality and quantity of links. Again Alone (2020) leads (PR ≈ 3.97), indicating strong, reciprocal ties to other highly connected movies. Fatal Affair (2020) and Meander (2020) trail closely, anchoring clusters of psychological‑thriller and mind‑bending sci‑fi plots. The distribution is shallow—few hubs, many leaves—implying a recommendation engine could surface these high‑PageRank films as versatile “entry points” into multiple genre clusters.
def check_graph_stats():
    """Return sanity-check counts for the movie similarity graph (node count,
    SIMILAR_TO edge count, isolated-node count), keyed by the Cypher query
    that produced each value."""
    count_queries = (
        "MATCH (m:Movie) RETURN count(m) AS nodeCount",
        "MATCH ()-[r:SIMILAR_TO]->() RETURN count(r) AS relCount",
        "MATCH (m:Movie) WHERE NOT EXISTS((m)-[:SIMILAR_TO]-()) RETURN count(m) AS isolatedNodes"
    )
    stats = {}
    with driver.session() as session:
        for cypher in count_queries:
            stats[cypher] = session.run(cypher).single()[0]
    return stats
# Print the graph sanity-check counts.
stats = check_graph_stats()
print(stats)
{'MATCH (m:Movie) RETURN count(m) AS nodeCount': 1115, 'MATCH ()-[r:SIMILAR_TO]->() RETURN count(r) AS relCount': 1666, 'MATCH (m:Movie) WHERE NOT EXISTS((m)-[:SIMILAR_TO]-()) RETURN count(m) AS isolatedNodes': 524}
def check_communities():
    """Count the distinct `louvainCommunity` values actually stored on Movie
    nodes, to cross-check the communityCount reported by gds.louvain.write."""
    community_count_query = """
    MATCH (m:Movie)
    RETURN count(distinct m.louvainCommunity) AS actualCommunityCount
    """
    with driver.session() as session:
        record = session.run(community_count_query).single()
    return record["actualCommunityCount"]
# Cross-check the community count against the node properties.
actual_count = check_communities()
print(f"Actual distinct communities from node properties: {actual_count}")
Actual distinct communities from node properties: 577