import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pyplot as plt

# Load the filtered movie table and its precomputed plot embeddings from disk.
# (This always reloads, even if the objects are already in memory.)
filtered_df_2020 = pd.read_csv("../../data/processed/movie_plots_filtered_2020.csv")
embeddings = np.load("../../data/processed/movie_embeddings.npy")

# Row i of `embeddings` is later paired with row i of the table, so both must
# have the same length. Raise explicitly instead of using `assert`, which is
# skipped entirely when Python runs with -O.
if len(filtered_df_2020) != embeddings.shape[0]:
    raise ValueError(
        f"Mismatch in data sizes: {len(filtered_df_2020)} rows in the movie table "
        f"vs {embeddings.shape[0]} embedding vectors"
    )

print(f"Ready: {embeddings.shape[0]} vectors of dim {embeddings.shape[1]}")

Ready: 1115 vectors of dim 1024

%%time
# Pairwise cosine similarity between every pair of embedding rows.
# The result is a square, symmetric matrix; cosine similarity is bounded by [-1, 1].
sim_matrix = cosine_similarity(embeddings)

# Overwrite the self-similarity entries on the diagonal with 0 so that a
# movie is never reported as its own closest match in later searches.
sim_matrix[np.diag_indices_from(sim_matrix)] = 0

print(f"Similarity matrix shape: {sim_matrix.shape}")
print(f"Sample sim (movie 0 vs 1): {sim_matrix[0,1]:.3f}")

# Persist the diagonal-masked matrix so it can be reloaded instead of recomputed.
np.save("../../data/processed/movie_sim_matrix.npy", sim_matrix)

Similarity matrix shape: (1115, 1115)
Sample sim (movie 0 vs 1): 0.492
CPU times: user 29.2 ms, sys: 6.89 ms, total: 36.1 ms
Wall time: 23.2 ms

# Number of leading movies (top-left corner of the matrix) inspected in this cell.
n_preview = 20

# Work on a copy of the corner so that masking its diagonal with NaN does not
# modify sim_matrix itself.
sub_matrix = sim_matrix[:n_preview, :n_preview].copy()

# NaN on the diagonal lets the nan-aware reductions below ignore self-pairs.
np.fill_diagonal(sub_matrix, np.nan)

# Find most similar pair (computed once; the flat argmax is converted back to
# a (row, col) position inside the corner).
max_val = np.nanmax(sub_matrix)
max_i, max_j = np.unravel_index(np.nanargmax(sub_matrix), sub_matrix.shape)
title_max1 = filtered_df_2020['title'].iloc[max_i]
title_max2 = filtered_df_2020['title'].iloc[max_j]

print(f"Most similar pair: '{title_max1}' and '{title_max2}' with similarity {max_val:.3f}")

# Find most dissimilar pair
min_val = np.nanmin(sub_matrix)
min_i, min_j = np.unravel_index(np.nanargmin(sub_matrix), sub_matrix.shape)
title_min1 = filtered_df_2020['title'].iloc[min_i]
title_min2 = filtered_df_2020['title'].iloc[min_j]

print(f"Most dissimilar pair: '{title_min1}' and '{title_min2}' with similarity {min_val:.3f}")

# Check out the pairs at the upper left corner.
# The heatmap is drawn from sim_matrix (diagonal = 0), matching the plot title,
# not from the NaN-masked sub_matrix used for the searches above.
preview_titles = filtered_df_2020['title'].iloc[:n_preview]
plt.figure(figsize=(12, 10))
sns.heatmap(sim_matrix[:n_preview, :n_preview],
            xticklabels=preview_titles,
            yticklabels=preview_titles,
            cmap='coolwarm')
plt.title("Cosine Similarity Heatmap of Movie Plot Embeddings (upper left corner), Self-similarity masked as 0")
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

Most similar pair: 'the menu' and 'somebody i used to know' with similarity 0.656
Most dissimilar pair: 'she said' and 'the last kingdom: seven kings must die' with similarity 0.245

%%time

# Each unordered movie pair appears exactly once in the strict upper triangle
# (row < col). Restricting the work to those entries also leaves out the
# diagonal, which was overwritten with 0 earlier in this notebook and would
# otherwise pull the average down and could mask an all-negative maximum.
pair_rows, pair_cols = np.triu_indices(sim_matrix.shape[0], k=1)
pair_sims = sim_matrix[pair_rows, pair_cols]

# Basic stats (skipped when there are fewer than two movies, i.e. no pairs).
if pair_sims.size:
    print(f"Avg similarity: {pair_sims.mean():.3f}")
    print(f"Max non-self sim: {pair_sims.max():.3f}")

# Threshold and extract top pairs, if it's too dissimilar, we won't keep them
threshold = 0.5
above_threshold = pair_sims > threshold
kept_rows = pair_rows[above_threshold]
kept_cols = pair_cols[above_threshold]

# Positional title lookup for all kept pairs at once, replacing per-pair .iloc
# calls inside a Python double loop.
title_values = filtered_df_2020['title'].to_numpy()

# Building from named columns keeps the frame well-formed (and sortable) even
# when no pair exceeds the threshold. np.triu_indices walks the triangle row
# by row, so the pre-sort row order and index match the original nested loop.
pairs_df = pd.DataFrame({
    'movie1_idx': kept_rows,
    'movie1_title': title_values[kept_rows],
    'movie2_idx': kept_cols,
    'movie2_title': title_values[kept_cols],
    'similarity': pair_sims[above_threshold],
}).sort_values('similarity', ascending=False)
print(f"Number of pairs above {threshold}: {len(pairs_df)}")

# Save for DB import
pairs_df.to_csv("../../data/processed/movie_sim_pairs.csv", index=False)

Avg similarity: 0.513
Max non-self sim: 0.919
Number of pairs above 0.5: 361506

# Preview the first 15 rows of the similarity-sorted pair table.
print(f"Top 15 pairs with similarity above {threshold}:")
pairs_df.iloc[:15]

Top 15 pairs with similarity above 0.5:

# Titles in the table are lower-cased, so the lookup string is lower-case too.
query_title = "mortal kombat legends: scorpion's revenge"
print("Example: Most similar to 'Mortal Kombat Legends: Scorpion's Revenge'")

# pairs_df stores each pair once with movie1_idx < movie2_idx, so the queried
# movie can sit in either title column. Matching only movie1_title would drop
# every neighbour whose index is lower than the queried movie's.
involves_query = (pairs_df['movie1_title'] == query_title) | (pairs_df['movie2_title'] == query_title)
pairs_df[involves_query].sort_values('similarity', ascending=False).head(10)

Example: Most similar to 'Mortal Kombat Legends: Scorpion's Revenge'

	movie1_idx	movie1_title	movie2_idx	movie2_title	similarity
112032	184	after we collided	737	after we fell	0.919264
128629	215	the kissing booth 2	555	the kissing booth 3	0.911171
36438	61	mortal kombat legends: scorpion's revenge	742	mortal kombat	0.903275
361487	1106	fear street: part one - 1994	1109	fear street: part three - 1666	0.873837
168410	293	enola holmes	1017	enola holmes 2	0.852084
315705	708	halloween kills	1089	halloween ends	0.850219
114922	189	to all the boys: p.s. i still love you	649	to all the boys: always and forever	0.838510
359174	1018	pinocchio	1025	guillermo del toro's pinocchio	0.827098
361493	1107	fear street: part two - 1978	1109	fear street: part three - 1666	0.816748
361485	1106	fear street: part one - 1994	1107	fear street: part two - 1978	0.811916
215633	397	peninsula	1005	carter	0.808633
195559	355	fatal affair	738	till death	0.802363
70343	112	horse girl	850	dual	0.799508
323849	746	slumber party massacre	854	no exit	0.797028
72455	116	dangerous lies	738	till death	0.795985

	movie1_idx	movie1_title	movie2_idx	movie2_title	similarity
36438	61	mortal kombat legends: scorpion's revenge	742	mortal kombat	0.903275
36392	61	mortal kombat legends: scorpion's revenge	500	snake eyes	0.663003
36397	61	mortal kombat legends: scorpion's revenge	528	godzilla vs. kong	0.641427
36365	61	mortal kombat legends: scorpion's revenge	351	jiu jitsu	0.632985
36451	61	mortal kombat legends: scorpion's revenge	821	fistful of vengeance	0.618561
36483	61	mortal kombat legends: scorpion's revenge	1112	shang-chi and the legend of the ten rings	0.616623
36484	61	mortal kombat legends: scorpion's revenge	1113	the suicide squad	0.615829
36405	61	mortal kombat legends: scorpion's revenge	557	born a champion	0.610690
36400	61	mortal kombat legends: scorpion's revenge	545	prisoners of the ghostland	0.605057
36360	61	mortal kombat legends: scorpion's revenge	324	pokémon the movie: secrets of the jungle	0.599053