import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import torch

from neo4j import GraphDatabase

# Neo4j connection details
URI = "bolt://localhost:7687"
USERNAME = "neo4j"
PASSWORD = "your_password"  # Set during Neo4j setup

# Create a driver instance
driver = GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD))


# Test connection to Neo4j
def test_connection(tx):
    result = tx.run("RETURN 1")
    return result.single()[0]

with driver.session() as session:
    result = session.execute_read(test_connection)
    print(f"Connection successful: {result}")

# Check device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

/share/crsp/lab/pkaiser/ddlin/project-3-ddlin-mids/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

Connection successful: 1
Using device: cpu

def purge_neo4j():
    """
    Purge all nodes, relationships, and GDS graph projections from the Neo4j database.
    """
    with driver.session() as session:
        # Step 1: Delete all nodes and relationships
        session.run("MATCH (n) DETACH DELETE n")
        print("Deleted all nodes and relationships")
        
        # Step 2: Drop all GDS graph projections
        drop_graphs_query = """
        CALL gds.graph.list()
        YIELD graphName
        CALL gds.graph.drop(graphName, false)
        YIELD graphName AS droppedGraph
        RETURN droppedGraph
        """
        result = session.run(drop_graphs_query)
        dropped_graphs = [record["droppedGraph"] for record in result]
        if dropped_graphs:
            print(f"Dropped GDS graphs: {dropped_graphs}")
        else:
            print("No GDS graphs to drop")
        
        # Step 3: Drop indexes and constraints (optional, if used)
        drop_indexes_query = """
        CALL db.indexes()
        YIELD name
        CALL db.index.drop(name)
        YIELD name AS droppedIndex
        RETURN droppedIndex
        """
        result = session.run(drop_indexes_query)
        dropped_indexes = [record["droppedIndex"] for record in result]
        if dropped_indexes:
            print(f"Dropped indexes: {dropped_indexes}")
        else:
            print("No indexes to drop")
        
        drop_constraints_query = """
        CALL db.constraints()
        YIELD name
        CALL db.constraint.drop(name)
        YIELD name AS droppedConstraint
        RETURN droppedConstraint
        """
        result = session.run(drop_constraints_query)
        dropped_constraints = [record["droppedConstraint"] for record in result]
        if dropped_constraints:
            print(f"Dropped constraints: {dropped_constraints}")
        else:
            print("No constraints to drop")

# Run purge
try:
    purge_neo4j()
except Exception as e:
    print(f"Error during purge: {e}")

# Verify database is empty
with driver.session() as session:
    result = session.run("MATCH (n) RETURN count(n) AS nodeCount")
    node_count = result.single()["nodeCount"]
    print(f"Verification: {node_count} nodes remain in the database")

Deleted all nodes and relationships
No GDS graphs to drop
Error during purge: {code: Neo.ClientError.Procedure.ProcedureNotFound} {message: There is no procedure with the name `db.indexes` registered for this database instance. Please ensure you've spelled the procedure name correctly and that the procedure is properly deployed.}
Verification: 0 nodes remain in the database

# Model the Data in Neo4j

# Nodes: Create a Movie node for each movie with properties title, year, and plot_embedding (stored as a list of floats, as Neo4j doesn’t support binary embeddings directly).
# Relationships: Optionally, create SIMILAR_TO relationships between movies based on embedding similarity (e.g., top-K similar movies). This can be computed on-the-fly or precomputed.
# Function to create a movie node
def create_movie_node(tx, title, year, embedding):
    query = """
    CREATE (m:Movie {title: $title, year: $year, plot_embedding: $embedding})
    RETURN m
    """
    tx.run(query, title=title, year=year, embedding=embedding.tolist())

# Function to create similarity relationships (optional)
def create_similarity_relationships(tx, title1, title2, similarity):
    query = """
    MATCH (m1:Movie {title: $title1}), (m2:Movie {title: $title2})
    CREATE (m1)-[:SIMILAR_TO {score: $similarity}]->(m2)
    """
    tx.run(query, title1=title1, title2=title2, similarity=similarity)

# Load embeddings and DataFrame (already in your notebook)
embeddings = np.load("../../data/processed/movie_embeddings.npy")
filtered_df_2020 = pd.read_csv("../../data/processed/movie_plots_filtered_2020.csv")

# Verify lengths match
if embeddings.shape[0] != filtered_df_2020.shape[0]:
    raise ValueError("Embeddings and DataFrame lengths do not match!")

# Store movies in Neo4j
with driver.session() as session:
    for i, row in filtered_df_2020.iterrows():
        session.execute_write(create_movie_node, 
                            title=row["title"], 
                            year=row["year"], 
                            embedding=embeddings[i])

print(f"Stored {len(filtered_df_2020)} movies in Neo4j")

# Optional: Create similarity relationships (top-5 similar movies)
k = 5
similarity_matrix = cosine_similarity(embeddings)
with driver.session() as session:
    for i, row in filtered_df_2020.iterrows():
        # Get top-K similar movies (excluding self)
        similarities = similarity_matrix[i]
        top_k_indices = np.argsort(similarities)[-(k+1):-1][::-1]  # Top k, excluding self
        for j in top_k_indices:
            similarity = similarities[j]
            if similarity > 0.7:  # Threshold to avoid weak relationships
                session.execute_write(create_similarity_relationships,
                                    title1=row["title"],
                                    title2=filtered_df_2020.iloc[j]["title"],
                                    similarity=float(similarity))

print(f"Created similarity relationships in Neo4j")

Stored 1115 movies in Neo4j
Created similarity relationships in Neo4j

# Step 1: Load the Model for Query Encoding
# Load the model (same as used for embeddings)
model = SentenceTransformer(
    'jinaai/jina-embeddings-v3', 
    device=device, 
    trust_remote_code=True
)

def run_louvain():
    # Create graph projection
    create_graph_query = """
    CALL gds.graph.project(
        'movieGraph',
        'Movie',
        'SIMILAR_TO',
        { relationshipProperties: 'score' }
    )
    """
    
    # Run Louvain algorithm (write mode)
    louvain_query = """
    CALL gds.louvain.write(
        'movieGraph',
        { relationshipWeightProperty: 'score', writeProperty: 'louvainCommunity' }
    )
    YIELD communityCount, modularity
    RETURN communityCount, modularity
    """
    
    # Query results
    result_query = """
    MATCH (m:Movie)
    RETURN m.title AS title, m.year AS year, m.louvainCommunity AS community
    ORDER BY m.louvainCommunity, m.title
    """
    
    with driver.session() as session:
        # Create projection
        session.run(create_graph_query)
        
        # Run Louvain
        result = session.run(louvain_query)
        louvain_stats = result.single()
        print(f"Louvain Results: {louvain_stats['communityCount']} communities, modularity: {louvain_stats['modularity']}")
        
        # Fetch results
        results = session.run(result_query)
        data = [{"title": r["title"], "year": r["year"], "community": r["community"]} for r in results]
        
        # Drop projection to free memory
        session.run("CALL gds.graph.drop('movieGraph')")
        
        return pd.DataFrame(data)

# Run and display
louvain_df = run_louvain()
louvain_df

Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated field from a procedure. ('schema' returned by 'gds.graph.drop' is deprecated.)} {position: line: 1, column: 1, offset: 0} for query: "CALL gds.graph.drop('movieGraph')"

Louvain Results: 577 communities, modularity: 0.6783023466767196

# Get the top 3 communities by count
top_communities = louvain_df.community.value_counts().head(3)
print("Top 3 communities by count:")
print(top_communities)

Top 3 communities by count:
community
1099    90
683     75
107     69
Name: count, dtype: int64

# Community 1099 is horror movies
louvain_df[louvain_df.community == 1099]

def run_pagerank():
    # Create graph projection
    create_graph_query = """
    CALL gds.graph.project(
        'movieGraph',
        'Movie',
        {SIMILAR_TO: {orientation: 'UNDIRECTED'}},
        { relationshipProperties: 'score' }
    )
    """
    
    # Run PageRank (write mode)
    pagerank_query = """
    CALL gds.pageRank.write(
        'movieGraph',
        { relationshipWeightProperty: 'score', writeProperty: 'pageRank' }
    )
    YIELD nodePropertiesWritten, ranIterations
    RETURN nodePropertiesWritten, ranIterations
    """
    
    # Query top-ranked movies
    result_query = """
    MATCH (m:Movie)
    RETURN m.title AS title, m.year AS year, m.pageRank AS pageRank
    ORDER BY m.pageRank DESC
    LIMIT 10
    """
    
    with driver.session() as session:
        session.run(create_graph_query)
        result = session.run(pagerank_query)
        pagerank_stats = result.single()
        print(f"PageRank: {pagerank_stats['nodePropertiesWritten']} nodes updated, {pagerank_stats['ranIterations']} iterations")
        results = session.run(result_query)
        data = [{"title": r["title"], "year": r["year"], "pageRank": r["pageRank"]} for r in results]
        session.run("CALL gds.graph.drop('movieGraph')")
        return pd.DataFrame(data)

# Run and display
pagerank_df = run_pagerank()
print(pagerank_df)

Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated field from a procedure. ('schema' returned by 'gds.graph.drop' is deprecated.)} {position: line: 1, column: 1, offset: 0} for query: "CALL gds.graph.drop('movieGraph')"

PageRank: 1115 nodes updated, 20 iterations
                     title  year  pageRank
0                    alone  2020  3.972866
1             fatal affair  2020  3.674068
2                  meander  2020  3.563413
3         the price we pay  2022  3.232798
4                abandoned  2022  2.821537
5                      son  2021  2.772342
6                 bed rest  2022  2.744644
7                triggered  2020  2.719897
8  the dark and the wicked  2020  2.710617
9                  red dot  2021  2.663872

def run_betweenness_centrality():
    # Create graph projection
    create_graph_query = """
    CALL gds.graph.project(
        'movieGraph',
        'Movie',
        {SIMILAR_TO: {orientation: 'UNDIRECTED'}},
        { relationshipProperties: 'score' }
    )
    """
    
    # Run Betweenness (write mode)
    betweenness_query = """
    CALL gds.betweenness.write(
        'movieGraph',
        { relationshipWeightProperty: 'score', writeProperty: 'betweenness' }
    )
    YIELD nodePropertiesWritten
    RETURN nodePropertiesWritten
    """
    
    # Query top nodes
    result_query = """
    MATCH (m:Movie)
    RETURN m.title AS title, m.year AS year, m.betweenness AS betweenness
    ORDER BY m.betweenness DESC
    LIMIT 10
    """
    
    with driver.session() as session:
        session.run(create_graph_query)
        result = session.run(betweenness_query)
        betweenness_stats = result.single()
        print(f"Betweenness: {betweenness_stats['nodePropertiesWritten']} nodes updated")
        results = session.run(result_query)
        data = [{"title": r["title"], "year": r["year"], "betweenness": r["betweenness"]} for r in results]
        session.run("CALL gds.graph.drop('movieGraph')")
        return pd.DataFrame(data)

# Run and display
betweenness_df = run_betweenness_centrality()
print(betweenness_df)

Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated field from a procedure. ('schema' returned by 'gds.graph.drop' is deprecated.)} {position: line: 1, column: 1, offset: 0} for query: "CALL gds.graph.drop('movieGraph')"

Betweenness: 1115 nodes updated
                       title  year  betweenness
0                      alone  2020      12084.0
1                    meander  2020      11067.0
2           the price we pay  2022       9884.0
3               fatal affair  2020       8140.0
4  the killing of two lovers  2020       7680.0
5                  triggered  2020       7554.0
6            blood and money  2020       7031.0
7                        son  2021       6109.0
8                  abandoned  2022       5580.0
9             alice, darling  2022       5497.0

def check_graph_stats():
    queries = [
        "MATCH (m:Movie) RETURN count(m) AS nodeCount",
        "MATCH ()-[r:SIMILAR_TO]->() RETURN count(r) AS relCount",
        "MATCH (m:Movie) WHERE NOT EXISTS((m)-[:SIMILAR_TO]-()) RETURN count(m) AS isolatedNodes"
    ]
    with driver.session() as session:
        results = {q: session.run(q).single()[0] for q in queries}
    return results

stats = check_graph_stats()
print(stats)

{'MATCH (m:Movie) RETURN count(m) AS nodeCount': 1115, 'MATCH ()-[r:SIMILAR_TO]->() RETURN count(r) AS relCount': 1666, 'MATCH (m:Movie) WHERE NOT EXISTS((m)-[:SIMILAR_TO]-()) RETURN count(m) AS isolatedNodes': 524}

def check_communities():
    query = """
    MATCH (m:Movie)
    RETURN count(distinct m.louvainCommunity) AS actualCommunityCount
    """
    with driver.session() as session:
        return session.run(query).single()["actualCommunityCount"]

actual_count = check_communities()
print(f"Actual distinct communities from node properties: {actual_count}")

Actual distinct communities from node properties: 577

	title	year	community
0	she said	2022	1
1	80 for brady	2023	3
2	luther: the fallen sun	2023	5
3	on a wing and a prayer	2023	9
4	true spirit	2023	16
...	...	...	...
1110	eternals	2021	1111
1111	fistful of vengeance	2022	1112
1112	shang-chi and the legend of the ten rings	2021	1112
1113	the suicide squad	2021	1113
1114	tick, tick... boom!	2021	1114

	title	year	community
1014	a banquet	2021	1099
1015	a house on the bayou	2021	1099
1016	abandoned	2022	1099
1017	aftersun	2022	1099
1018	all the old knives	2022	1099
...	...	...	...
1099	what lies below	2020	1099
1100	words on bathroom walls	2020	1099
1101	wrong turn	2021	1099
1102	you are not my mother	2021	1099
1103	you should have left	2020	1099