In [12]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pyplot as plt

# Reload the persisted artefacts so this notebook can run from a fresh kernel:
# the filtered movie table and the plot-embedding matrix saved alongside it.
filtered_df_2020 = pd.read_csv("../../data/processed/movie_plots_filtered_2020.csv")
embeddings = np.load("../../data/processed/movie_embeddings.npy")

# Rows of the table and rows of the embedding matrix are matched by position
# further down, so the two must have the same length.
n_movies = len(filtered_df_2020)
n_vectors = embeddings.shape[0]
assert n_movies == n_vectors, "Mismatch in data sizes"

print(f"Ready: {n_vectors} vectors of dim {embeddings.shape[1]}")
Ready: 1115 vectors of dim 1024
In [ ]:
%%time
# Pairwise cosine similarity between every pair of embedding rows.
# The result is a square, symmetric matrix; cosine similarity lies in [-1, 1].
sim_matrix = cosine_similarity(embeddings)

# Every vector has similarity 1 with itself. Overwrite the diagonal with 0 so
# that "most similar" look-ups never return a movie paired with itself.
np.fill_diagonal(sim_matrix, 0)

matrix_shape = sim_matrix.shape
first_pair_sim = sim_matrix[0, 1]
print(f"Similarity matrix shape: {matrix_shape}")
print(f"Sample sim (movie 0 vs 1): {first_pair_sim:.3f}")

# Persist the diagonal-masked matrix for later use.
sim_matrix_path = "../../data/processed/movie_sim_matrix.npy"
np.save(sim_matrix_path, sim_matrix)
Similarity matrix shape: (1115, 1115)
Sample sim (movie 0 vs 1): 0.492
CPU times: user 29.2 ms, sys: 6.89 ms, total: 36.1 ms
Wall time: 23.2 ms
In [20]:
# Inspect the upper-left corner of the similarity matrix: the first
# `n_preview` movies compared against each other.
n_preview = 20
preview_titles = filtered_df_2020['title'].iloc[:n_preview]

# Work on a copy so the NaN masking below never leaks into sim_matrix.
sub_matrix = sim_matrix[:n_preview, :n_preview].copy()

# Set diagonal to NaN so the nan-aware reductions ignore self-similarities
np.fill_diagonal(sub_matrix, np.nan)

# Find most similar pair (computed once; the flat argmax is converted back
# into a (row, col) position inside the sub-matrix)
max_val = np.nanmax(sub_matrix)
max_i, max_j = np.unravel_index(np.nanargmax(sub_matrix), sub_matrix.shape)
title_max1 = preview_titles.iloc[max_i]
title_max2 = preview_titles.iloc[max_j]

print(f"Most similar pair: '{title_max1}' and '{title_max2}' with similarity {max_val:.3f}")

# Find most dissimilar pair
min_val = np.nanmin(sub_matrix)
min_i, min_j = np.unravel_index(np.nanargmin(sub_matrix), sub_matrix.shape)
title_min1 = preview_titles.iloc[min_i]
title_min2 = preview_titles.iloc[min_j]

print(f"Most dissimilar pair: '{title_min1}' and '{title_min2}' with similarity {min_val:.3f}")

# Heatmap of the same corner. It is drawn from sim_matrix (diagonal = 0),
# not from the NaN-masked copy, which is what the figure title describes.
plt.figure(figsize=(12, 10))
sns.heatmap(sim_matrix[:n_preview, :n_preview],
            xticklabels=preview_titles,
            yticklabels=preview_titles,
            cmap='coolwarm')
plt.title("Cosine Similarity Heatmap of Movie Plot Embeddings (upper left corner), Self-similarity masked as 0")
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
Most similar pair: 'the menu' and 'somebody i used to know' with similarity 0.656
Most dissimilar pair: 'she said' and 'the last kingdom: seven kings must die' with similarity 0.245
No description has been provided for this image
In [ ]:
%%time

# Enumerate each unordered movie pair exactly once via the strict upper
# triangle (i < j). This skips the diagonal, which was overwritten with 0
# earlier and would otherwise drag the mean down (and could mask the true
# maximum if every real similarity were negative).
n_movies = sim_matrix.shape[0]
assert n_movies > 1, "Need at least two movies to form pairs"
row_idx, col_idx = np.triu_indices(n_movies, k=1)
pair_sims = sim_matrix[row_idx, col_idx]

# Basic stats over distinct pairs only
print(f"Avg similarity: {np.mean(pair_sims):.3f}")
print(f"Max non-self sim: {np.max(pair_sims):.3f}")

# Threshold and extract top pairs, if it's too dissimilar, we won't keep them
threshold = 0.5
keep = pair_sims > threshold
kept_rows = row_idx[keep]
kept_cols = col_idx[keep]

# Titles are looked up by position, matching the row order of the embeddings.
titles = filtered_df_2020['title'].to_numpy()

# Build the frame in one shot from column arrays. triu_indices yields pairs in
# row-major order, so the resulting index labels match the order in which a
# nested i/j loop would have appended them. Declaring the columns up front also
# keeps sort_values working when no pair passes the threshold.
pairs_df = pd.DataFrame({
    'movie1_idx': kept_rows,
    'movie1_title': titles[kept_rows],
    'movie2_idx': kept_cols,
    'movie2_title': titles[kept_cols],
    'similarity': pair_sims[keep],
}).sort_values('similarity', ascending=False)
print(f"Number of pairs above {threshold}: {len(pairs_df)}")

# Save for DB import
pairs_df.to_csv("../../data/processed/movie_sim_pairs.csv", index=False)
Avg similarity: 0.513
Max non-self sim: 0.919
Number of pairs above 0.5: 361506
In [28]:
# Show the strongest matches; pairs_df was sorted by similarity (descending)
# when it was built, so the first rows are the most similar pairs.
print(f"Top 15 pairs with similarity above {threshold}:")
pairs_df.iloc[:15]
Top 15 pairs with similarity above 0.5:
Out[28]:
movie1_idx movie1_title movie2_idx movie2_title similarity
112032 184 after we collided 737 after we fell 0.919264
128629 215 the kissing booth 2 555 the kissing booth 3 0.911171
36438 61 mortal kombat legends: scorpion's revenge 742 mortal kombat 0.903275
361487 1106 fear street: part one - 1994 1109 fear street: part three - 1666 0.873837
168410 293 enola holmes 1017 enola holmes 2 0.852084
315705 708 halloween kills 1089 halloween ends 0.850219
114922 189 to all the boys: p.s. i still love you 649 to all the boys: always and forever 0.838510
359174 1018 pinocchio 1025 guillermo del toro's pinocchio 0.827098
361493 1107 fear street: part two - 1978 1109 fear street: part three - 1666 0.816748
361485 1106 fear street: part one - 1994 1107 fear street: part two - 1978 0.811916
215633 397 peninsula 1005 carter 0.808633
195559 355 fatal affair 738 till death 0.802363
70343 112 horse girl 850 dual 0.799508
323849 746 slumber party massacre 854 no exit 0.797028
72455 116 dangerous lies 738 till death 0.795985
In [29]:
print("Example: Most similar to 'Mortal Kombat Legends: Scorpion's Revenge'")

# pairs_df stores every pair once, with the lower row index as movie1, so the
# query movie can sit in either column. Match on both, otherwise neighbours
# with a smaller row index than the query movie are silently dropped.
# Only pairs that passed the similarity threshold are present in pairs_df.
query_title = "mortal kombat legends: scorpion's revenge"
involves_query = (
    (pairs_df['movie1_title'] == query_title)
    | (pairs_df['movie2_title'] == query_title)
)
pairs_df[involves_query].sort_values('similarity', ascending=False).head(10)
Example: Most similar to 'Mortal Kombat Legends: Scorpion's Revenge'
Out[29]:
movie1_idx movie1_title movie2_idx movie2_title similarity
36438 61 mortal kombat legends: scorpion's revenge 742 mortal kombat 0.903275
36392 61 mortal kombat legends: scorpion's revenge 500 snake eyes 0.663003
36397 61 mortal kombat legends: scorpion's revenge 528 godzilla vs. kong 0.641427
36365 61 mortal kombat legends: scorpion's revenge 351 jiu jitsu 0.632985
36451 61 mortal kombat legends: scorpion's revenge 821 fistful of vengeance 0.618561
36483 61 mortal kombat legends: scorpion's revenge 1112 shang-chi and the legend of the ten rings 0.616623
36484 61 mortal kombat legends: scorpion's revenge 1113 the suicide squad 0.615829
36405 61 mortal kombat legends: scorpion's revenge 557 born a champion 0.610690
36400 61 mortal kombat legends: scorpion's revenge 545 prisoners of the ghostland 0.605057
36360 61 mortal kombat legends: scorpion's revenge 324 pokémon the movie: secrets of the jungle 0.599053
In [ ]: