In [2]:
from typing import Optional

import numpy as np
import pandas as pd
import plotly.express as px
import redis
import torch
from redis.commands.search.field import VectorField, TextField, NumericField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
# Pick the compute device: prefer the GPU when CUDA is available.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")
/share/crsp/lab/pkaiser/ddlin/project-3-ddlin-mids/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
Using device: cpu
In [3]:
# Load the precomputed plot embeddings and the matching filtered metadata table.
embeddings = np.load("../../data/processed/movie_embeddings.npy")
filtered_df_2020 = pd.read_csv("../../data/processed/movie_plots_filtered_2020.csv")

# Sanity-print the sizes; the two artifacts must line up row-for-row.
print(f'Shape of embeddings: {embeddings.shape}')
print(f"Number of rows in filtered_df_2020: {len(filtered_df_2020)}")
filtered_df_2020.head()
Shape of embeddings: (1115, 1024) Number of rows in filtered_df_2020: 1115
Out[3]:
| title | year | plot | |
|---|---|---|---|
| 0 | the good house | 2021 | Hildy Good, once the most prosperous realtor ... |
| 1 | she said | 2022 | In 2017, 'New York Times' reporter Jodi Kanto... |
| 2 | the menu | 2022 | Foodie Tyler Ledford and his date, Margot Mil... |
| 3 | 80 for brady | 2023 | In 2017, best friends Lou, Trish, Maura, and ... |
| 4 | the last kingdom: seven kings must die | 2023 | Olaf Guthfrithson Anlaf, a Viking king from I... |
In [4]:
# Connect to Redis, make sure Redis is running
client = redis.Redis(host='localhost', port=6379, decode_responses=True)
print(client.ping())
# Check redis version
print(f"Redis version: {redis.__version__}")
# Check if Redisearch module is loaded
modules = client.execute_command("MODULE", "LIST")
print(modules)
# Drop a stale index if one exists. Catch only ResponseError (what RediSearch
# raises for "Unknown index name") instead of a bare `except:`, so real
# problems — lost connection, auth errors, typos — still surface.
try:
    client.ft("movie_idx").dropindex(delete_documents=False)
    print("Dropped existing index")
except redis.exceptions.ResponseError:
    print("Index didn't exist yet")
True Redis version: 5.0.8 [['name', 'search', 'ver', 21020, 'path', '/opt/redis-stack/lib/redisearch.so', 'args', []]] Dropped existing index
In [5]:
# Load the embeddings and filtered DataFrame to Redis
# Ensure the embeddings match the DataFrame length
if embeddings.shape[0] != filtered_df_2020.shape[0]:
    raise ValueError("Embeddings and DataFrame lengths do not match!")
# Define schema with correct dimension
schema = (
    TextField("title"),
    NumericField("year"),
    VectorField("plot_embedding", "FLAT", {
        "TYPE": "FLOAT32",
        "DIM": 1024,  # jina-embeddings-v3 dimension
        "DISTANCE_METRIC": "COSINE"
    })
)
# Create index
index_def = IndexDefinition(prefix=["movie:"], index_type=IndexType.HASH)
client.ft("movie_idx").create_index(schema, definition=index_def)
print("Created new index")
# Store data. Queue all HSETs in one pipeline so the ~1115 rows go over the
# network in a single round-trip instead of one call per row.
pipe = client.pipeline(transaction=False)
for i, row in filtered_df_2020.iterrows():
    key = f"movie:{i}"
    pipe.hset(key, mapping={
        "title": row["title"],
        "year": row["year"],
        # Raw float32 bytes, matching the vector field's declared TYPE/DIM.
        "plot_embedding": embeddings[i].astype(np.float32).tobytes()
    })
pipe.execute()
print(f"Stored {len(filtered_df_2020)} movies in Redis")
# Verify storage
info = client.ft("movie_idx").info()
print(f"Index contains {info['num_docs']} documents")
print("Redisearch index info:", info)
Created new index
Stored 1115 movies in Redis
Index contains 1115 documents
Redisearch index info: {'index_name': 'movie_idx', 'index_options': [], 'index_definition': ['key_type', 'HASH', 'prefixes', ['movie:'], 'default_score', '1'], 'attributes': [['identifier', 'title', 'attribute', 'title', 'type', 'TEXT', 'WEIGHT', '1'], ['identifier', 'year', 'attribute', 'year', 'type', 'NUMERIC'], ['identifier', 'plot_embedding', 'attribute', 'plot_embedding', 'type', 'VECTOR', 'algorithm', 'FLAT', 'data_type', 'FLOAT32', 'dim', 1024, 'distance_metric', 'COSINE']], 'num_docs': 1115, 'max_doc_id': 2230, 'num_terms': 2059, 'num_records': 10434, 'inverted_sz_mb': '0.23740291595458984', 'vector_index_sz_mb': '8.066841125488281', 'total_inverted_index_blocks': 2083, 'offset_vectors_sz_mb': '0.005733489990234375', 'doc_table_size_mb': '0.11417961120605469', 'sortable_values_size_mb': '0', 'key_table_size_mb': '0.033768653869628906', 'tag_overhead_sz_mb': '0', 'text_overhead_sz_mb': '0.0687265396118164', 'total_index_memory_sz_mb': '0.47181129455566406', 'geoshapes_sz_mb': '0', 'records_per_doc_avg': '9.357847213745117', 'bytes_per_record_avg': '23.858060836791992', 'offsets_per_term_avg': '0.5761932134628296', 'offset_bits_per_record_avg': '8', 'hash_indexing_failures': 0, 'total_indexing_time': '51.659000396728516', 'indexing': 0, 'percent_indexed': '1', 'number_of_uses': 1, 'cleaning': 0, 'gc_stats': ['bytes_collected', '0', 'total_ms_run', '0', 'total_cycles', '0', 'average_cycle_time_ms', 'nan', 'last_run_time_ms', '0', 'gc_numeric_trees_missed', '0', 'gc_blocks_denied', '0'], 'cursor_stats': ['global_idle', 0, 'global_total', 0, 'index_capacity', 128, 'index_total', 0], 'dialect_stats': ['dialect_1', 0, 'dialect_2', 0, 'dialect_3', 0, 'dialect_4', 0], 'Index Errors': ['indexing failures', 0, 'last indexing error', 'N/A', 'last indexing error key', 'N/A', 'background indexing status', 'OK'], 'field statistics': [['identifier', 'title', 'attribute', 'title', 'Index Errors', ['indexing failures', 0, 'last indexing error', 'N/A', 'last indexing error 
key', 'N/A']], ['identifier', 'year', 'attribute', 'year', 'Index Errors', ['indexing failures', 0, 'last indexing error', 'N/A', 'last indexing error key', 'N/A']], ['identifier', 'plot_embedding', 'attribute', 'plot_embedding', 'Index Errors', ['indexing failures', 0, 'last indexing error', 'N/A', 'last indexing error key', 'N/A']]]}
In [ ]:
# Instantiate the same embedding model that produced the stored vectors, so
# query embeddings live in the same vector space as the indexed plots.
model = SentenceTransformer(
    'jinaai/jina-embeddings-v3',
    trust_remote_code=True,  # jina v3 ships custom modeling code on the Hub
    device=device,
)
In [7]:
# Function to find similar movies based on a query text
def find_similar_movies(query_text: str, k: int = 5, exclude_title: Optional[str] = None):
    """
    Find top K similar movies based on a query text.

    Args:
        query_text: Plot description or keywords to search for.
        k: Number of results to request from the KNN search.
        exclude_title: Optional title to exclude (e.g., if querying with an
            existing movie). Exclusion happens *after* the KNN search, so
            pass k+1 when excluding to still receive k results.

    Returns:
        List of dicts with title, year, and similarity score (1 = identical).
    """
    # Encode the query with the same model used for the stored embeddings;
    # normalization makes cosine distance meaningful.
    query_embedding = model.encode([query_text], normalize_embeddings=True)[0].astype(np.float32).tobytes()
    # KNN on plot_embedding; Redis returns cosine *distance* (0 = identical),
    # so sort ascending to get the most similar first.
    base_query = f"*=>[KNN {k} @plot_embedding $vec AS vector_score]"
    query = (
        Query(base_query)
        .sort_by("vector_score", asc=True)
        .return_fields("title", "year", "vector_score")
        .paging(0, k)
        .dialect(2)  # DIALECT 2 is required for vector-search query syntax
    )
    # Execute search
    results = client.ft("movie_idx").search(query, {"vec": query_embedding})
    # Process results
    similar_movies = []
    for doc in results.docs:
        title = doc.title
        if exclude_title and title.lower() == exclude_title.lower():
            continue  # Skip the queried movie itself
        similar_movies.append({
            "title": title,
            "year": int(doc.year),
            "similarity_score": 1 - float(doc.vector_score)  # distance -> similarity
        })
    return similar_movies
In [8]:
# Embedding text before querying
query_text = "zombie apocalypse in a city, escape from hordes of undead"
similar = find_similar_movies(query_text, k=5)
# Fixed the message: the colon belongs after the query, not mid-sentence.
print(f"Top similar movies with query '{query_text}':")
pd.DataFrame(similar)
Top similar movies: with query 'zombie apocalypse in a city, escape from hordes of undead':
Out[8]:
| title | year | similarity_score | |
|---|---|---|---|
| 0 | army of the dead | 2021 | 0.690218 |
| 1 | peninsula | 2020 | 0.643609 |
| 2 | unhuman | 2022 | 0.642859 |
| 3 | escape room: tournament of champions | 2021 | 0.618451 |
| 4 | sky sharks | 2020 | 0.618377 |
In [9]:
# Use an existing movie's plot as the query (row 716 is a James Bond film).
movie_idx = 716
# Scalar label access via .at; keep the title so we can exclude it below.
search_title = filtered_df_2020.at[movie_idx, 'title']
query_text = filtered_df_2020.at[movie_idx, 'plot']
print(f"Querying similar movies for a James Bond movie: {search_title}")
# Request k=6 so five results remain after the movie itself is excluded.
similar = find_similar_movies(query_text, k=6, exclude_title=search_title)
pd.DataFrame(similar)
Querying similar movies for a James Bond movie: no time to die
Out[9]:
| title | year | similarity_score | |
|---|---|---|---|
| 0 | trigger point | 2021 | 0.691921 |
| 1 | sas: red notice | 2021 | 0.686741 |
| 2 | the rhythm section | 2020 | 0.653405 |
| 3 | gunpowder milkshake | 2021 | 0.646881 |
| 4 | the postcard killings | 2020 | 0.644570 |
In [10]:
# Visualize the embeddings using t-SNE
print("Visualizing movie embeddings using t-SNE...")
# Cluster embeddings into 5 groups so the 2-D projection can be colored by a
# coarse semantic grouping (seeded for reproducibility).
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(embeddings)
# Add clusters to DataFrame for reference
filtered_df_2020['cluster'] = clusters
# Apply t-SNE for 2D visualization (seeded for reproducibility)
tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=1000)
tsne_emb = tsne.fit_transform(embeddings)
# Create scatter plot with Plotly, using discrete colors for clusters and custom hover text
fig = px.scatter(
    x=tsne_emb[:, 0],
    y=tsne_emb[:, 1],
    color=filtered_df_2020['cluster'].astype(str),  # string labels -> discrete color scale
    hover_data={'title': filtered_df_2020['title']},  # custom hover to show only title
    labels={'color': 'Cluster'},  # Label for the legend
    title="t-SNE Visualization of Movie Embeddings Colored by Cluster"
)
# Restrict the hover box to the movie title only
fig.update_traces(
    hovertemplate="title: %{customdata[0]}<extra></extra>"
)
fig.update_layout(
    showlegend=True,
    coloraxis_colorbar_title="Cluster",
    width=800,  # 8 inches at 100 pixels per inch
    height=600  # 6 inches at 100 pixels per inch
)
print("Plotting t-SNE visualization of movie embeddings colored by cluster")
# Fixed grammar in the user-facing message ("a interactive" -> "an interactive").
print("This is an interactive plot, hover over points to see movie titles.")
# Save a html file for interactive viewing, this can be downloaded and viewed in a browser
fig.write_html("movie_embeddings_tsne.html")
# Interactive output will not render in static views (e.g., on GitHub)
fig.show()
Visualizing movie embeddings using t-SNE... Plotting t-SNE visualization of movie embeddings colored by cluster This is a interactive plot, hover over points to see movie titles.
In [11]:
# Second copy of the figure above kept as its own cell for quick reference.
# NOTE(review): despite the comment, this is the same interactive Plotly
# scatter, not a static image.
cluster_labels = filtered_df_2020['cluster'].astype(str)
fig_static = px.scatter(
    x=tsne_emb[:, 0],
    y=tsne_emb[:, 1],
    color=cluster_labels,
    hover_data={'title': filtered_df_2020['title']},
    labels={'color': 'Cluster'},
    title="t-SNE Visualization of Movie Embeddings Colored by Cluster"
)
# Hover shows only the movie title
fig_static.update_traces(hovertemplate="title: %{customdata[0]}<extra></extra>")
# 800x600 px ~= 8x6 inches at 100 dpi; keep the cluster legend visible.
# update_layout returns the figure, so the cell renders it as its output.
fig_static.update_layout(
    showlegend=True,
    coloraxis_colorbar_title="Cluster",
    width=800,
    height=600
)
In [ ]: