In [12]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pyplot as plt
# Reload the filtered plot table and its embedding matrix from disk so this
# section does not depend on variables left over from earlier cells.
plots_csv_path = "../../data/processed/movie_plots_filtered_2020.csv"
embeddings_npy_path = "../../data/processed/movie_embeddings.npy"

filtered_df_2020 = pd.read_csv(plots_csv_path)
embeddings = np.load(embeddings_npy_path)

# The table and the embedding matrix must have the same number of rows;
# abort early if the two files have drifted apart.
assert embeddings.shape[0] == len(filtered_df_2020), "Mismatch in data sizes"
print(f"Ready: {embeddings.shape[0]} vectors of dim {embeddings.shape[1]}")
Ready: 1115 vectors of dim 1024
In [ ]:
%%time
# Compute full similarity matrix (symmetric, values in [-1,1] but ~[0,1] for normalized vectors)
sim_matrix = cosine_similarity(embeddings)
# Mask diagonal (self-sim=1) if not needed
# This way we can find the most similar pairs that is not self-similar
np.fill_diagonal(sim_matrix, 0)
print(f"Similarity matrix shape: {sim_matrix.shape}")
print(f"Sample sim (movie 0 vs 1): {sim_matrix[0,1]:.3f}")
# Save for later use
np.save("../../data/processed/movie_sim_matrix.npy", sim_matrix)
Similarity matrix shape: (1115, 1115) Sample sim (movie 0 vs 1): 0.492 CPU times: user 29.2 ms, sys: 6.89 ms, total: 36.1 ms Wall time: 23.2 ms
In [20]:
# Inspect the top-left corner of the similarity matrix: report the most and
# least similar pair among the first `preview_size` movies, then plot that
# corner as a heatmap.
preview_size = 20  # side length of the inspected corner
preview_titles = filtered_df_2020['title'].iloc[:preview_size]

# Work on a copy so the NaN masking below does not leak into sim_matrix.
sub_matrix = sim_matrix[:preview_size, :preview_size].copy()
# NaN (rather than 0) on the diagonal lets the nan-aware reductions skip
# self-similarities for both the max and the min search.
np.fill_diagonal(sub_matrix, np.nan)

# Most similar pair within the corner
max_val = np.nanmax(sub_matrix)
max_i, max_j = np.unravel_index(np.nanargmax(sub_matrix), sub_matrix.shape)
title_max1 = preview_titles.iloc[max_i]
title_max2 = preview_titles.iloc[max_j]
print(f"Most similar pair: '{title_max1}' and '{title_max2}' with similarity {max_val:.3f}")

# Most dissimilar pair within the corner
min_val = np.nanmin(sub_matrix)
min_i, min_j = np.unravel_index(np.nanargmin(sub_matrix), sub_matrix.shape)
title_min1 = preview_titles.iloc[min_i]
title_min2 = preview_titles.iloc[min_j]
print(f"Most dissimilar pair: '{title_min1}' and '{title_min2}' with similarity {min_val:.3f}")

# Heatmap of the same corner. This deliberately plots sim_matrix (diagonal
# already set to 0 when the matrix was built), matching the figure title.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    sim_matrix[:preview_size, :preview_size],
    xticklabels=preview_titles,
    yticklabels=preview_titles,
    cmap='coolwarm',
    ax=ax,
)
ax.set_title("Cosine Similarity Heatmap of Movie Plot Embeddings (upper left corner), Self-similarity masked as 0")
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
Most similar pair: 'the menu' and 'somebody i used to know' with similarity 0.656 Most dissimilar pair: 'she said' and 'the last kingdom: seven kings must die' with similarity 0.245
In [ ]:
%%time
# Basic stats
print(f"Avg similarity: {np.mean(sim_matrix):.3f}")
print(f"Max non-self sim: {np.max(sim_matrix):.3f}")
# Threshold and extract top pairs, if it's too dissimilar, we won't keep them
threshold = 0.5
pairs = []
for i in range(sim_matrix.shape[0]):
for j in range(i+1, sim_matrix.shape[0]): # Upper triangle
if sim_matrix[i,j] > threshold:
pairs.append({
'movie1_idx': i, 'movie1_title': filtered_df_2020.iloc[i]['title'],
'movie2_idx': j, 'movie2_title': filtered_df_2020.iloc[j]['title'],
'similarity': sim_matrix[i,j]
})
pairs_df = pd.DataFrame(pairs).sort_values('similarity', ascending=False)
print(f"Number of pairs above {threshold}: {len(pairs_df)}")
pairs_df.head(10) # Inspect top similar movies
# Save for DB import
pairs_df.to_csv("../../data/processed/movie_sim_pairs.csv", index=False)
Avg similarity: 0.513 Max non-self sim: 0.919 Number of pairs above 0.5: 361506
In [28]:
# Preview the strongest matches: show the first 15 rows of the pairs table
# in its current order.
print(f"Top 15 pairs with similarity above {threshold}:")
pairs_df.iloc[:15]
Top 15 pairs with similarity above 0.5:
Out[28]:
| | movie1_idx | movie1_title | movie2_idx | movie2_title | similarity |
|---|---|---|---|---|---|
| 112032 | 184 | after we collided | 737 | after we fell | 0.919264 |
| 128629 | 215 | the kissing booth 2 | 555 | the kissing booth 3 | 0.911171 |
| 36438 | 61 | mortal kombat legends: scorpion's revenge | 742 | mortal kombat | 0.903275 |
| 361487 | 1106 | fear street: part one - 1994 | 1109 | fear street: part three - 1666 | 0.873837 |
| 168410 | 293 | enola holmes | 1017 | enola holmes 2 | 0.852084 |
| 315705 | 708 | halloween kills | 1089 | halloween ends | 0.850219 |
| 114922 | 189 | to all the boys: p.s. i still love you | 649 | to all the boys: always and forever | 0.838510 |
| 359174 | 1018 | pinocchio | 1025 | guillermo del toro's pinocchio | 0.827098 |
| 361493 | 1107 | fear street: part two - 1978 | 1109 | fear street: part three - 1666 | 0.816748 |
| 361485 | 1106 | fear street: part one - 1994 | 1107 | fear street: part two - 1978 | 0.811916 |
| 215633 | 397 | peninsula | 1005 | carter | 0.808633 |
| 195559 | 355 | fatal affair | 738 | till death | 0.802363 |
| 70343 | 112 | horse girl | 850 | dual | 0.799508 |
| 323849 | 746 | slumber party massacre | 854 | no exit | 0.797028 |
| 72455 | 116 | dangerous lies | 738 | till death | 0.795985 |
In [29]:
# Look up the nearest neighbours of a single movie. Each pair is stored once
# with the lower row index in the movie1_* columns, so the queried movie can
# sit on either side of a pair; filtering on movie1_title alone would silently
# drop every match whose index is lower than the queried movie's.
query_title = "mortal kombat legends: scorpion's revenge"
involves_query = (
    (pairs_df['movie1_title'] == query_title)
    | (pairs_df['movie2_title'] == query_title)
)
print("Example: Most similar to 'Mortal Kombat Legends: Scorpion's Revenge'")
pairs_df[involves_query].sort_values('similarity', ascending=False).head(10)
Example: Most similar to 'Mortal Kombat Legends: Scorpion's Revenge'
Out[29]:
| | movie1_idx | movie1_title | movie2_idx | movie2_title | similarity |
|---|---|---|---|---|---|
| 36438 | 61 | mortal kombat legends: scorpion's revenge | 742 | mortal kombat | 0.903275 |
| 36392 | 61 | mortal kombat legends: scorpion's revenge | 500 | snake eyes | 0.663003 |
| 36397 | 61 | mortal kombat legends: scorpion's revenge | 528 | godzilla vs. kong | 0.641427 |
| 36365 | 61 | mortal kombat legends: scorpion's revenge | 351 | jiu jitsu | 0.632985 |
| 36451 | 61 | mortal kombat legends: scorpion's revenge | 821 | fistful of vengeance | 0.618561 |
| 36483 | 61 | mortal kombat legends: scorpion's revenge | 1112 | shang-chi and the legend of the ten rings | 0.616623 |
| 36484 | 61 | mortal kombat legends: scorpion's revenge | 1113 | the suicide squad | 0.615829 |
| 36405 | 61 | mortal kombat legends: scorpion's revenge | 557 | born a champion | 0.610690 |
| 36400 | 61 | mortal kombat legends: scorpion's revenge | 545 | prisoners of the ghostland | 0.605057 |
| 36360 | 61 | mortal kombat legends: scorpion's revenge | 324 | pokémon the movie: secrets of the jungle | 0.599053 |
In [ ]: