In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import json
import re
# Set up plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# Load the data
df = pd.read_csv("../../data/raw/movie_plots.csv")
# Basic dataset info
print("Dataset Shape:", df.shape)
print("\nColumn Names:")
print(df.columns.tolist())
print("\nData Types:")
print(df.dtypes)
print("\nFirst few rows:")
df.head()
Dataset Shape: (21689, 11) Column Names: ['title', 'stars', 'directors', 'year', 'genre', 'runtime', 'ratingCount', 'plot', 'summary', 'imdb_rating', 'source'] Data Types: title object stars object directors object year int64 genre object runtime int64 ratingCount int64 plot object summary object imdb_rating float64 source object dtype: object First few rows:
Out[1]:
| title | stars | directors | year | genre | runtime | ratingCount | plot | summary | imdb_rating | source | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | repulsion | [catherine deneuve, ian hendry, john fraser, y... | [roman polanski] | 1965 | [drama, horror, thriller] | 105 | 54715 | Carol Ledoux, a beautiful and shy Belgian Man... | {{Short description,1965 film by Roman Polansk... | 7.6 | tt0059646 |
| 1 | the underworld story | [dan duryea, herbert marshall, gale storm, how... | [cy endfield] | 1950 | [crime, drama, film-noir] | 91 | 1276 | When big-city newspaper reporter Mike Reese (... | {{Short description,1950 film by Cy Endfield}}... | 7.0 | tt0043088 |
| 2 | the undertaker and his pals | [warrene ott, james westmoreland, marty friedm... | [t.l.p. swicegood] | 1966 | [comedy, horror] | 63 | 1186 | A motorcycle-riding undertaker and his two bi... | {{Short description,1966 American horror film ... | 4.5 | tt0061140 |
| 3 | this property is condemned | [natalie wood, robert redford, charles bronson... | [sydney pollack] | 1966 | [drama, romance] | 110 | 6034 | The film starts with a frame story in which a... | {{Short description,1966 film by Sydney Pollac... | 7.0 | tt0061089 |
| 4 | the glass bottom boat | [doris day, rod taylor, arthur godfrey, john m... | [frank tashlin] | 1966 | [comedy, romance] | 110 | 4908 | Axel Nordstrom manages a glass-bottom boat to... | {{Short description,1966 American romantic com... | 6.4 | tt0060463 |
In [2]:
# Check for missing values and data quality
print("Missing Values:")
print(df.isnull().sum())
print(f"\nTotal missing values: {df.isnull().sum().sum()}")
# Check for duplicates
print(f"\nDuplicate rows: {df.duplicated().sum()}")
# Basic statistics for numerical columns
print("\nNumerical Columns Statistics:")
numerical_cols = ['year', 'runtime', 'ratingCount', 'imdb_rating']
print(df[numerical_cols].describe())
# Check unique values in categorical columns
print("\nUnique Values per Column:")
for col in ['title', 'directors', 'genre', 'source']:
if col in df.columns:
print(f"{col}: {df[col].nunique()} unique values")
Missing Values:
title 0
stars 0
directors 0
year 0
genre 0
runtime 0
ratingCount 0
plot 2337
summary 0
imdb_rating 0
source 0
dtype: int64
Total missing values: 2337
Duplicate rows: 0
Numerical Columns Statistics:
year runtime ratingCount imdb_rating
count 21689.000000 21689.00000 2.168900e+04 21689.000000
mean 1999.117433 102.28609 4.076619e+04 6.011610
std 18.340120 18.68148 1.199447e+05 1.125659
min 1950.000000 -1.00000 1.001000e+03 1.200000
25% 1988.000000 90.00000 2.161000e+03 5.400000
50% 2005.000000 99.00000 5.687000e+03 6.200000
75% 2014.000000 110.00000 2.451400e+04 6.800000
max 2023.000000 776.00000 2.727886e+06 9.300000
Unique Values per Column:
title: 20617 unique values
directors: 9863 unique values
genre: 664 unique values
source: 21674 unique values
In [3]:
# Vector database preparation insights - FIXED EXECUTION ORDER
print("=== VECTOR DATABASE PREPARATION INSIGHTS ===\n")
# ENSURE COMPUTED COLUMNS EXIST
if 'plot_length' not in df.columns:
print("Creating computed columns...")
df['plot_length'] = df['plot'].astype(str).apply(len)
df['plot_word_count'] = df['plot'].astype(str).str.split().apply(len)
df['summary_length'] = df['summary'].astype(str).apply(len)
# Text quality assessment with ACTUAL numbers
print("=== TEXT QUALITY ASSESSMENT ===")
print(f"Total movies: {len(df)}")
print(f"Movies with missing plots: {df['plot'].isnull().sum()}")
print(f"Movies with empty plots: {(df['plot'].astype(str).str.strip() == '').sum()}")
print(f"Movies with very short plots (<100 chars): {(df['plot_length'] < 100).sum()}")
print(f"Movies with very long plots (>5000 chars): {(df['plot_length'] > 5000).sum()}")
# Language detection (basic)
def contains_english(text):
# Basic check for English characters
english_chars = len(re.findall(r'[a-zA-Z]', str(text)))
total_chars = len(re.findall(r'\w', str(text)))
return english_chars / max(total_chars, 1) > 0.8
# Ensure language detection column exists
if 'is_english' not in df.columns:
df['is_english'] = df['plot'].apply(contains_english)
print(f"\nMovies with primarily English text: {df['is_english'].sum()}")
print(f"Movies with non-English text: {(~df['is_english']).sum()}")
# Unique content analysis
unique_plots = df['plot'].nunique()
print(f"\nUnique plot summaries: {unique_plots}")
print(f"Duplicate plots: {len(df) - unique_plots}")
# Year-based filtering analysis
print("\n=== YEAR-BASED FILTERING ANALYSIS ===")
year_ranges = {
'1950-1970': len(df[(df['year'] >= 1950) & (df['year'] <= 1970)]),
'1971-1990': len(df[(df['year'] >= 1971) & (df['year'] <= 1990)]),
'1991-2010': len(df[(df['year'] >= 1991) & (df['year'] <= 2010)]),
'2011-2023': len(df[(df['year'] >= 2011) & (df['year'] <= 2023)])
}
for year_range, count in year_ranges.items():
percentage = count / len(df) * 100
print(f"{year_range}: {count} movies ({percentage:.1f}%)")
# Summary statistics for vector database
print("\n=== VECTOR DATABASE RECOMMENDATIONS ===")
print("1. Text Preprocessing:")
print(f" - Average plot length: {df['plot_length'].mean():.0f} characters")
print(f" - Median plot length: {df['plot_length'].median():.0f} characters")
print(f" - Optimal chunk size: ~500-1000 characters")
print(f" - Consider truncating plots >3000 characters")
print("\n2. Quality Filtering Strategy:")
print(" - Remove movies with missing plots")
print(" - Remove movies with empty plots")
print(" - Remove movies with very short plots (<100 chars)")
print(" - Remove non-English plots")
print(" - Consider removing movies with ratingCount < 100")
print("\n3. Year-Based Filtering Options:")
print(" - Recent focus (2010-2023): High volume, modern topics")
print(" - Balanced approach (1991-2023): Good coverage, manageable size")
print(" - Full dataset (1950-2023): Complete but larger")
# Create filtered dataset for vector database
filtered_df = df[
(df['plot'].notna()) &
(df['plot'].astype(str).str.strip() != '') &
(df['is_english']) &
(df['ratingCount'] >= 100) &
(df['plot_length'] >= 100)
].copy()
print(f"\n=== FINAL FILTERED DATASET ===")
print(f"Original dataset: {len(df)} movies")
print(f"Filtered dataset: {len(filtered_df)} movies")
print(f"Data reduction: {len(df) - len(filtered_df)} movies ({(len(df) - len(filtered_df))/len(df)*100:.1f}%)")
# Save filtered dataset
import os
os.makedirs("../../data/processed", exist_ok=True)
filtered_df.to_csv("../../data/processed/movie_plots_filtered.csv", index=False)
print(f"\nFiltered dataset saved to: ../../data/processed/movie_plots_filtered.csv")
# Display sample of filtered data
print("\n=== FILTERED DATA SAMPLE ===")
print(f"Year range in filtered data: {filtered_df['year'].min()} - {filtered_df['year'].max()}")
print(f"Average plot length in filtered data: {filtered_df['plot_length'].mean():.0f} characters")
print(f"Average word count in filtered data: {filtered_df['plot_word_count'].mean():.0f} words")
=== VECTOR DATABASE PREPARATION INSIGHTS === Creating computed columns... === TEXT QUALITY ASSESSMENT === Total movies: 21689 Movies with missing plots: 2337 Movies with empty plots: 133 Movies with very short plots (<100 chars): 2522 Movies with very long plots (>5000 chars): 1482 Movies with primarily English text: 21556 Movies with non-English text: 133 Unique plot summaries: 19206 Duplicate plots: 2483 === YEAR-BASED FILTERING ANALYSIS === 1950-1970: 2255 movies (10.4%) 1971-1990: 3708 movies (17.1%) 1991-2010: 8102 movies (37.4%) 2011-2023: 7624 movies (35.2%) === VECTOR DATABASE RECOMMENDATIONS === 1. Text Preprocessing: - Average plot length: 2694 characters - Median plot length: 2910 characters - Optimal chunk size: ~500-1000 characters - Consider truncating plots >3000 characters 2. Quality Filtering Strategy: - Remove movies with missing plots - Remove movies with empty plots - Remove movies with very short plots (<100 chars) - Remove non-English plots - Consider removing movies with ratingCount < 100 3. Year-Based Filtering Options: - Recent focus (2010-2023): High volume, modern topics - Balanced approach (1991-2023): Good coverage, manageable size - Full dataset (1950-2023): Complete but larger === FINAL FILTERED DATASET === Original dataset: 21689 movies Filtered dataset: 19167 movies Data reduction: 2522 movies (11.6%) Filtered dataset saved to: ../../data/processed/movie_plots_filtered.csv === FILTERED DATA SAMPLE === Year range in filtered data: 1950 - 2023 Average plot length in filtered data: 3048 characters Average word count in filtered data: 521 words
In [4]:
# Keep only moves > = 2020, remove duplicates
filtered_df_2020 = filtered_df[filtered_df.year >= 2020][['title', 'year', 'plot']].drop_duplicates()
# Print the shape and the head of it
print(f'Shape of the filtered df: {filtered_df_2020.shape}')
# Let's save a copy of the filtered DataFrame for further analysis
filtered_df_2020.to_csv("../../data/processed/movie_plots_filtered_2020.csv", index=False)
print(f"Filtered dataset for 2020+ saved to: ../../data/processed/movie_plots_filtered_2020.csv")
# Display the first few rows of the filtered DataFrame
print("\n=== FILTERED DATA SAMPLE FOR 2023+ ===")
filtered_df_2020.head()
Shape of the filtered df: (1115, 3) Filtered dataset for 2020+ saved to: ../../data/processed/movie_plots_filtered_2020.csv === FILTERED DATA SAMPLE FOR 2023+ ===
Out[4]:
| title | year | plot | |
|---|---|---|---|
| 481 | the good house | 2021 | Hildy Good, once the most prosperous realtor ... |
| 482 | she said | 2022 | In 2017, 'New York Times' reporter Jodi Kanto... |
| 483 | the menu | 2022 | Foodie Tyler Ledford and his date, Margot Mil... |
| 484 | 80 for brady | 2023 | In 2017, best friends Lou, Trish, Maura, and ... |
| 485 | the last kingdom: seven kings must die | 2023 | Olaf Guthfrithson Anlaf, a Viking king from I... |
In [5]:
%%time
RUN_EMBEDDINGS = True
# Encode plots
plots = filtered_df_2020['plot'].tolist()
if RUN_EMBEDDINGS:
print("Running embeddings generation...")
# Let's extract the movies plots and turn them into embedding using jina-embeddings-v3
from sentence_transformers import SentenceTransformer
import torch
# Check if CUDA is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
# Load jina-embeddings-v3
model = SentenceTransformer(
'jinaai/jina-embeddings-v3',
device=device,
trust_remote_code=True
)
# Get actual dimension size
dim = model.get_sentence_embedding_dimension()
print(f"Model dimension: {dim}")
print(f"Number of plots to encode: {len(plots)}")
print(f"First plot preview: {plots[0][:100]}...")
# Encode with proper parameters
embeddings = model.encode(
plots,
batch_size=8, # Reduce batch size if memory issues
show_progress_bar=True,
normalize_embeddings=True # Normalize for cosine similarity
)
print(f"Embeddings shape: {embeddings.shape}")
print(f"Sample embedding (first 10 values): {embeddings[0][:10]}")
# Save embeddings to a file
import numpy as np
np.save("../../data/processed/movie_embeddings.npy", embeddings)
print("Embeddings saved to: ../../data/processed/movie_embeddings.npy")
else:
print("Skipping embeddings generation. Load precomputed embeddings.")
# Load precomputed embeddings
embeddings = np.load("../../data/processed/movie_embeddings.npy")
print(f"Loaded embeddings shape: {embeddings.shape}")
print(f"Sample embedding (first 10 values): {embeddings[0][:10]}")
Running embeddings generation...
/share/crsp/lab/pkaiser/ddlin/project-3-ddlin-mids/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
Using device: cuda
flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation. flash_attn is not installed. Using PyTorch native attention implementation.
Model dimension: 1024 Number of plots to encode: 1115 First plot preview: Hildy Good, once the most prosperous realtor in Wendover, has fallen on hard times following her di...
Batches: 100%|██████████| 140/140 [00:26<00:00, 5.23it/s]
Embeddings shape: (1115, 1024) Sample embedding (first 10 values): [-0.03710938 -0.06689453 0.04199219 0.12011719 -0.02429199 -0.0456543 -0.01928711 0.16796875 -0.06835938 -0.05078125] Embeddings saved to: ../../data/processed/movie_embeddings.npy CPU times: user 31.3 s, sys: 1.74 s, total: 33 s Wall time: 37.4 s
In [6]:
# Sanity check
print("\n=== SANITY CHECK ===")
print(f"Number of plots: {len(plots)}")
print(f"Number of embeddings: {embeddings.shape[0]}")
=== SANITY CHECK === Number of plots: 1115 Number of embeddings: 1115
In [ ]: