In [2]:
from typing import Optional

import numpy as np
import pandas as pd
import plotly.express as px
import redis
import torch
from redis.commands.search.field import VectorField, TextField, NumericField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
# Pick the compute device: prefer the GPU when CUDA is available.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")
/share/crsp/lab/pkaiser/ddlin/project-3-ddlin-mids/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
Using device: cpu
In [3]:
# Load the precomputed plot embeddings and the matching filtered metadata table.
embeddings = np.load("../../data/processed/movie_embeddings.npy")
filtered_df_2020 = pd.read_csv("../../data/processed/movie_plots_filtered_2020.csv")

# Sanity-print the sizes; the two artifacts must line up row-for-row.
print(f'Shape of embeddings: {embeddings.shape}')
print(f"Number of rows in filtered_df_2020: {len(filtered_df_2020)}")
filtered_df_2020.head()
Shape of embeddings: (1115, 1024) Number of rows in filtered_df_2020: 1115
Out[3]:
| title | year | plot | |
|---|---|---|---|
| 0 | the good house | 2021 | Hildy Good, once the most prosperous realtor ... |
| 1 | she said | 2022 | In 2017, 'New York Times' reporter Jodi Kanto... |
| 2 | the menu | 2022 | Foodie Tyler Ledford and his date, Margot Mil... |
| 3 | 80 for brady | 2023 | In 2017, best friends Lou, Trish, Maura, and ... |
| 4 | the last kingdom: seven kings must die | 2023 | Olaf Guthfrithson Anlaf, a Viking king from I... |
In [4]:
# Connect to Redis, make sure Redis is running
client = redis.Redis(host='localhost', port=6379, decode_responses=True)
print(client.ping())
# Check redis version
print(f"Redis version: {redis.__version__}")
# Check if Redisearch module is loaded
modules = client.execute_command("MODULE", "LIST")
print(modules)
# Drop a stale index if one exists. Catch only ResponseError (what RediSearch
# raises for "Unknown index name") instead of a bare `except:`, so real
# problems — lost connection, auth errors, typos — still surface.
try:
    client.ft("movie_idx").dropindex(delete_documents=False)
    print("Dropped existing index")
except redis.exceptions.ResponseError:
    print("Index didn't exist yet")
True Redis version: 5.0.8 [['name', 'search', 'ver', 21020, 'path', '/opt/redis-stack/lib/redisearch.so', 'args', []]] Dropped existing index
In [5]:
# Load the embeddings and filtered DataFrame to Redis
# Ensure the embeddings match the DataFrame length
if embeddings.shape[0] != filtered_df_2020.shape[0]:
    raise ValueError("Embeddings and DataFrame lengths do not match!")
# Define schema with correct dimension
schema = (
    TextField("title"),
    NumericField("year"),
    VectorField("plot_embedding", "FLAT", {
        "TYPE": "FLOAT32",
        "DIM": 1024,  # jina-embeddings-v3 dimension
        "DISTANCE_METRIC": "COSINE"
    })
)
# Create index
index_def = IndexDefinition(prefix=["movie:"], index_type=IndexType.HASH)
client.ft("movie_idx").create_index(schema, definition=index_def)
print("Created new index")
# Store data. Queue all HSETs in one pipeline so the ~1115 rows go over the
# network in a single round-trip instead of one call per row.
pipe = client.pipeline(transaction=False)
for i, row in filtered_df_2020.iterrows():
    key = f"movie:{i}"
    pipe.hset(key, mapping={
        "title": row["title"],
        "year": row["year"],
        # Raw float32 bytes, matching the vector field's declared TYPE/DIM.
        "plot_embedding": embeddings[i].astype(np.float32).tobytes()
    })
pipe.execute()
print(f"Stored {len(filtered_df_2020)} movies in Redis")
# Verify storage
info = client.ft("movie_idx").info()
print(f"Index contains {info['num_docs']} documents")
print("Redisearch index info:", info)
Created new index
Stored 1115 movies in Redis
Index contains 1115 documents
Redisearch index info: {'index_name': 'movie_idx', 'index_options': [], 'index_definition': ['key_type', 'HASH', 'prefixes', ['movie:'], 'default_score', '1'], 'attributes': [['identifier', 'title', 'attribute', 'title', 'type', 'TEXT', 'WEIGHT', '1'], ['identifier', 'year', 'attribute', 'year', 'type', 'NUMERIC'], ['identifier', 'plot_embedding', 'attribute', 'plot_embedding', 'type', 'VECTOR', 'algorithm', 'FLAT', 'data_type', 'FLOAT32', 'dim', 1024, 'distance_metric', 'COSINE']], 'num_docs': 1115, 'max_doc_id': 2230, 'num_terms': 2059, 'num_records': 10434, 'inverted_sz_mb': '0.23740291595458984', 'vector_index_sz_mb': '8.066841125488281', 'total_inverted_index_blocks': 2083, 'offset_vectors_sz_mb': '0.005733489990234375', 'doc_table_size_mb': '0.11417961120605469', 'sortable_values_size_mb': '0', 'key_table_size_mb': '0.033768653869628906', 'tag_overhead_sz_mb': '0', 'text_overhead_sz_mb': '0.0687265396118164', 'total_index_memory_sz_mb': '0.47181129455566406', 'geoshapes_sz_mb': '0', 'records_per_doc_avg': '9.357847213745117', 'bytes_per_record_avg': '23.858060836791992', 'offsets_per_term_avg': '0.5761932134628296', 'offset_bits_per_record_avg': '8', 'hash_indexing_failures': 0, 'total_indexing_time': '51.659000396728516', 'indexing': 0, 'percent_indexed': '1', 'number_of_uses': 1, 'cleaning': 0, 'gc_stats': ['bytes_collected', '0', 'total_ms_run', '0', 'total_cycles', '0', 'average_cycle_time_ms', 'nan', 'last_run_time_ms', '0', 'gc_numeric_trees_missed', '0', 'gc_blocks_denied', '0'], 'cursor_stats': ['global_idle', 0, 'global_total', 0, 'index_capacity', 128, 'index_total', 0], 'dialect_stats': ['dialect_1', 0, 'dialect_2', 0, 'dialect_3', 0, 'dialect_4', 0], 'Index Errors': ['indexing failures', 0, 'last indexing error', 'N/A', 'last indexing error key', 'N/A', 'background indexing status', 'OK'], 'field statistics': [['identifier', 'title', 'attribute', 'title', 'Index Errors', ['indexing failures', 0, 'last indexing error', 'N/A', 'last indexing error 
key', 'N/A']], ['identifier', 'year', 'attribute', 'year', 'Index Errors', ['indexing failures', 0, 'last indexing error', 'N/A', 'last indexing error key', 'N/A']], ['identifier', 'plot_embedding', 'attribute', 'plot_embedding', 'Index Errors', ['indexing failures', 0, 'last indexing error', 'N/A', 'last indexing error key', 'N/A']]]}
In [ ]:
# Instantiate the same embedding model that produced the stored vectors, so
# query embeddings live in the same vector space as the indexed plots.
model = SentenceTransformer(
    'jinaai/jina-embeddings-v3',
    trust_remote_code=True,  # jina v3 ships custom modeling code on the Hub
    device=device,
)
In [7]:
# Function to find similar movies based on a query text
def find_similar_movies(query_text: str, k: int = 5, exclude_title: Optional[str] = None):
    """
    Find top K similar movies based on a query text.

    Args:
        query_text: Plot description or keywords to search for.
        k: Number of results to request from the KNN search.
        exclude_title: Optional title to exclude (e.g., if querying with an
            existing movie). Exclusion happens *after* the KNN search, so
            pass k+1 when excluding to still receive k results.

    Returns:
        List of dicts with title, year, and similarity score (1 = identical).
    """
    # Encode the query with the same model used for the stored embeddings;
    # normalization makes cosine distance meaningful.
    query_embedding = model.encode([query_text], normalize_embeddings=True)[0].astype(np.float32).tobytes()
    # KNN on plot_embedding; Redis returns cosine *distance* (0 = identical),
    # so sort ascending to get the most similar first.
    base_query = f"*=>[KNN {k} @plot_embedding $vec AS vector_score]"
    query = (
        Query(base_query)
        .sort_by("vector_score", asc=True)
        .return_fields("title", "year", "vector_score")
        .paging(0, k)
        .dialect(2)  # DIALECT 2 is required for vector-search query syntax
    )
    # Execute search
    results = client.ft("movie_idx").search(query, {"vec": query_embedding})
    # Process results
    similar_movies = []
    for doc in results.docs:
        title = doc.title
        if exclude_title and title.lower() == exclude_title.lower():
            continue  # Skip the queried movie itself
        similar_movies.append({
            "title": title,
            "year": int(doc.year),
            "similarity_score": 1 - float(doc.vector_score)  # distance -> similarity
        })
    return similar_movies
In [8]:
# Embedding text before querying
query_text = "zombie apocalypse in a city, escape from hordes of undead"
similar = find_similar_movies(query_text, k=5)
# Fixed the message: the colon belongs after the query, not mid-sentence.
print(f"Top similar movies with query '{query_text}':")
pd.DataFrame(similar)
Top similar movies: with query 'zombie apocalypse in a city, escape from hordes of undead':
Out[8]:
| title | year | similarity_score | |
|---|---|---|---|
| 0 | army of the dead | 2021 | 0.690218 |
| 1 | peninsula | 2020 | 0.643609 |
| 2 | unhuman | 2022 | 0.642859 |
| 3 | escape room: tournament of champions | 2021 | 0.618451 |
| 4 | sky sharks | 2020 | 0.618377 |
In [9]:
# Use an existing movie's plot as the query (row 716 is a James Bond film).
movie_idx = 716
# Scalar label access via .at; keep the title so we can exclude it below.
search_title = filtered_df_2020.at[movie_idx, 'title']
query_text = filtered_df_2020.at[movie_idx, 'plot']
print(f"Querying similar movies for a James Bond movie: {search_title}")
# Request k=6 so five results remain after the movie itself is excluded.
similar = find_similar_movies(query_text, k=6, exclude_title=search_title)
pd.DataFrame(similar)
Querying similar movies for a James Bond movie: no time to die
Out[9]:
| title | year | similarity_score | |
|---|---|---|---|
| 0 | trigger point | 2021 | 0.691921 |
| 1 | sas: red notice | 2021 | 0.686741 |
| 2 | the rhythm section | 2020 | 0.653405 |
| 3 | gunpowder milkshake | 2021 | 0.646881 |
| 4 | the postcard killings | 2020 | 0.644570 |
In [10]:
# Visualize the embeddings using t-SNE
print("Visualizing movie embeddings using t-SNE...")
# Cluster embeddings into 5 groups so the 2-D projection can be colored by a
# coarse semantic grouping (seeded for reproducibility).
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(embeddings)
# Add clusters to DataFrame for reference
filtered_df_2020['cluster'] = clusters
# Apply t-SNE for 2D visualization (seeded for reproducibility)
tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=1000)
tsne_emb = tsne.fit_transform(embeddings)
# Create scatter plot with Plotly, using discrete colors for clusters and custom hover text
fig = px.scatter(
    x=tsne_emb[:, 0],
    y=tsne_emb[:, 1],
    color=filtered_df_2020['cluster'].astype(str),  # string labels -> discrete color scale
    hover_data={'title': filtered_df_2020['title']},  # custom hover to show only title
    labels={'color': 'Cluster'},  # Label for the legend
    title="t-SNE Visualization of Movie Embeddings Colored by Cluster"
)
# Restrict the hover box to the movie title only
fig.update_traces(
    hovertemplate="title: %{customdata[0]}<extra></extra>"
)
fig.update_layout(
    showlegend=True,
    coloraxis_colorbar_title="Cluster",
    width=800,  # 8 inches at 100 pixels per inch
    height=600  # 6 inches at 100 pixels per inch
)
print("Plotting t-SNE visualization of movie embeddings colored by cluster")
# Fixed grammar in the user-facing message ("a interactive" -> "an interactive").
print("This is an interactive plot, hover over points to see movie titles.")
# Save a html file for interactive viewing, this can be downloaded and viewed in a browser
fig.write_html("movie_embeddings_tsne.html")
# Interactive output will not render in static views (e.g., on GitHub)
fig.show()
Visualizing movie embeddings using t-SNE... Plotting t-SNE visualization of movie embeddings colored by cluster This is a interactive plot, hover over points to see movie titles.
In [11]:
# Second copy of the figure above kept as its own cell for quick reference.
# NOTE(review): despite the comment, this is the same interactive Plotly
# scatter, not a static image.
cluster_labels = filtered_df_2020['cluster'].astype(str)
fig_static = px.scatter(
    x=tsne_emb[:, 0],
    y=tsne_emb[:, 1],
    color=cluster_labels,
    hover_data={'title': filtered_df_2020['title']},
    labels={'color': 'Cluster'},
    title="t-SNE Visualization of Movie Embeddings Colored by Cluster"
)
# Hover shows only the movie title
fig_static.update_traces(hovertemplate="title: %{customdata[0]}<extra></extra>")
# 800x600 px ~= 8x6 inches at 100 dpi; keep the cluster legend visible.
# update_layout returns the figure, so the cell renders it as its output.
fig_static.update_layout(
    showlegend=True,
    coloraxis_colorbar_title="Cluster",
    width=800,
    height=600
)
In [ ]: