import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import json
import re

# Set up plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Load the data
df = pd.read_csv("../../data/raw/movie_plots.csv")

# Basic dataset info
print("Dataset Shape:", df.shape)
print("\nColumn Names:")
print(df.columns.tolist())
print("\nData Types:")
print(df.dtypes)
print("\nFirst few rows:")
df.head()

Dataset Shape: (21689, 11)

Column Names:
['title', 'stars', 'directors', 'year', 'genre', 'runtime', 'ratingCount', 'plot', 'summary', 'imdb_rating', 'source']

Data Types:
title           object
stars           object
directors       object
year             int64
genre           object
runtime          int64
ratingCount      int64
plot            object
summary         object
imdb_rating    float64
source          object
dtype: object

First few rows:

# Check for missing values and data quality
print("Missing Values:")
print(df.isnull().sum())
print(f"\nTotal missing values: {df.isnull().sum().sum()}")

# Check for duplicates
print(f"\nDuplicate rows: {df.duplicated().sum()}")

# Basic statistics for numerical columns
print("\nNumerical Columns Statistics:")
numerical_cols = ['year', 'runtime', 'ratingCount', 'imdb_rating']
print(df[numerical_cols].describe())

# Check unique values in categorical columns
print("\nUnique Values per Column:")
for col in ['title', 'directors', 'genre', 'source']:
    if col in df.columns:
        print(f"{col}: {df[col].nunique()} unique values")

Missing Values:
title             0
stars             0
directors         0
year              0
genre             0
runtime           0
ratingCount       0
plot           2337
summary           0
imdb_rating       0
source            0
dtype: int64

Total missing values: 2337

Duplicate rows: 0

Numerical Columns Statistics:
               year      runtime   ratingCount   imdb_rating
count  21689.000000  21689.00000  2.168900e+04  21689.000000
mean    1999.117433    102.28609  4.076619e+04      6.011610
std       18.340120     18.68148  1.199447e+05      1.125659
min     1950.000000     -1.00000  1.001000e+03      1.200000
25%     1988.000000     90.00000  2.161000e+03      5.400000
50%     2005.000000     99.00000  5.687000e+03      6.200000
75%     2014.000000    110.00000  2.451400e+04      6.800000
max     2023.000000    776.00000  2.727886e+06      9.300000

Unique Values per Column:
title: 20617 unique values
directors: 9863 unique values
genre: 664 unique values
source: 21674 unique values

# Vector database preparation insights - FIXED EXECUTION ORDER
print("=== VECTOR DATABASE PREPARATION INSIGHTS ===\n")

# ENSURE COMPUTED COLUMNS EXIST
if 'plot_length' not in df.columns:
    print("Creating computed columns...")
    df['plot_length'] = df['plot'].astype(str).apply(len)
    df['plot_word_count'] = df['plot'].astype(str).str.split().apply(len)
    df['summary_length'] = df['summary'].astype(str).apply(len)

# Text quality assessment with ACTUAL numbers
print("=== TEXT QUALITY ASSESSMENT ===")
print(f"Total movies: {len(df)}")
print(f"Movies with missing plots: {df['plot'].isnull().sum()}")
print(f"Movies with empty plots: {(df['plot'].astype(str).str.strip() == '').sum()}")
print(f"Movies with very short plots (<100 chars): {(df['plot_length'] < 100).sum()}")
print(f"Movies with very long plots (>5000 chars): {(df['plot_length'] > 5000).sum()}")

# Language detection (basic)
def contains_english(text):
    # Basic check for English characters
    english_chars = len(re.findall(r'[a-zA-Z]', str(text)))
    total_chars = len(re.findall(r'\w', str(text)))
    return english_chars / max(total_chars, 1) > 0.8

# Ensure language detection column exists
if 'is_english' not in df.columns:
    df['is_english'] = df['plot'].apply(contains_english)

print(f"\nMovies with primarily English text: {df['is_english'].sum()}")
print(f"Movies with non-English text: {(~df['is_english']).sum()}")

# Unique content analysis
unique_plots = df['plot'].nunique()
print(f"\nUnique plot summaries: {unique_plots}")
print(f"Duplicate plots: {len(df) - unique_plots}")

# Year-based filtering analysis
print("\n=== YEAR-BASED FILTERING ANALYSIS ===")
year_ranges = {
    '1950-1970': len(df[(df['year'] >= 1950) & (df['year'] <= 1970)]),
    '1971-1990': len(df[(df['year'] >= 1971) & (df['year'] <= 1990)]),
    '1991-2010': len(df[(df['year'] >= 1991) & (df['year'] <= 2010)]),
    '2011-2023': len(df[(df['year'] >= 2011) & (df['year'] <= 2023)])
}

for year_range, count in year_ranges.items():
    percentage = count / len(df) * 100
    print(f"{year_range}: {count} movies ({percentage:.1f}%)")

# Summary statistics for vector database
print("\n=== VECTOR DATABASE RECOMMENDATIONS ===")
print("1. Text Preprocessing:")
print(f"   - Average plot length: {df['plot_length'].mean():.0f} characters")
print(f"   - Median plot length: {df['plot_length'].median():.0f} characters")
print(f"   - Optimal chunk size: ~500-1000 characters")
print(f"   - Consider truncating plots >3000 characters")

print("\n2. Quality Filtering Strategy:")
print("   - Remove movies with missing plots")
print("   - Remove movies with empty plots")
print("   - Remove movies with very short plots (<100 chars)")
print("   - Remove non-English plots")
print("   - Consider removing movies with ratingCount < 100")

print("\n3. Year-Based Filtering Options:")
print("   - Recent focus (2010-2023): High volume, modern topics")
print("   - Balanced approach (1991-2023): Good coverage, manageable size")
print("   - Full dataset (1950-2023): Complete but larger")

# Create filtered dataset for vector database
filtered_df = df[
    (df['plot'].notna()) &
    (df['plot'].astype(str).str.strip() != '') &
    (df['is_english']) &
    (df['ratingCount'] >= 100) &
    (df['plot_length'] >= 100)
].copy()

print(f"\n=== FINAL FILTERED DATASET ===")
print(f"Original dataset: {len(df)} movies")
print(f"Filtered dataset: {len(filtered_df)} movies")
print(f"Data reduction: {len(df) - len(filtered_df)} movies ({(len(df) - len(filtered_df))/len(df)*100:.1f}%)")

# Save filtered dataset
import os
os.makedirs("../../data/processed", exist_ok=True)
filtered_df.to_csv("../../data/processed/movie_plots_filtered.csv", index=False)
print(f"\nFiltered dataset saved to: ../../data/processed/movie_plots_filtered.csv")

# Display sample of filtered data
print("\n=== FILTERED DATA SAMPLE ===")
print(f"Year range in filtered data: {filtered_df['year'].min()} - {filtered_df['year'].max()}")
print(f"Average plot length in filtered data: {filtered_df['plot_length'].mean():.0f} characters")
print(f"Average word count in filtered data: {filtered_df['plot_word_count'].mean():.0f} words")

=== VECTOR DATABASE PREPARATION INSIGHTS ===

Creating computed columns...
=== TEXT QUALITY ASSESSMENT ===
Total movies: 21689
Movies with missing plots: 2337
Movies with empty plots: 133
Movies with very short plots (<100 chars): 2522
Movies with very long plots (>5000 chars): 1482

Movies with primarily English text: 21556
Movies with non-English text: 133

Unique plot summaries: 19206
Duplicate plots: 2483

=== YEAR-BASED FILTERING ANALYSIS ===
1950-1970: 2255 movies (10.4%)
1971-1990: 3708 movies (17.1%)
1991-2010: 8102 movies (37.4%)
2011-2023: 7624 movies (35.2%)

=== VECTOR DATABASE RECOMMENDATIONS ===
1. Text Preprocessing:
   - Average plot length: 2694 characters
   - Median plot length: 2910 characters
   - Optimal chunk size: ~500-1000 characters
   - Consider truncating plots >3000 characters

2. Quality Filtering Strategy:
   - Remove movies with missing plots
   - Remove movies with empty plots
   - Remove movies with very short plots (<100 chars)
   - Remove non-English plots
   - Consider removing movies with ratingCount < 100

3. Year-Based Filtering Options:
   - Recent focus (2010-2023): High volume, modern topics
   - Balanced approach (1991-2023): Good coverage, manageable size
   - Full dataset (1950-2023): Complete but larger

=== FINAL FILTERED DATASET ===
Original dataset: 21689 movies
Filtered dataset: 19167 movies
Data reduction: 2522 movies (11.6%)

Filtered dataset saved to: ../../data/processed/movie_plots_filtered.csv

=== FILTERED DATA SAMPLE ===
Year range in filtered data: 1950 - 2023
Average plot length in filtered data: 3048 characters
Average word count in filtered data: 521 words

# Keep only moves > = 2020, remove duplicates
filtered_df_2020 = filtered_df[filtered_df.year >= 2020][['title', 'year', 'plot']].drop_duplicates()

# Print the shape and the head of it
print(f'Shape of the filtered df: {filtered_df_2020.shape}')

# Let's save a copy of the filtered DataFrame for further analysis
filtered_df_2020.to_csv("../../data/processed/movie_plots_filtered_2020.csv", index=False)
print(f"Filtered dataset for 2020+ saved to: ../../data/processed/movie_plots_filtered_2020.csv")
# Display the first few rows of the filtered DataFrame
print("\n=== FILTERED DATA SAMPLE FOR 2023+ ===")
filtered_df_2020.head()

Shape of the filtered df: (1115, 3)
Filtered dataset for 2020+ saved to: ../../data/processed/movie_plots_filtered_2020.csv

=== FILTERED DATA SAMPLE FOR 2023+ ===

%%time

RUN_EMBEDDINGS = True

# Encode plots
plots = filtered_df_2020['plot'].tolist()

if RUN_EMBEDDINGS:
    print("Running embeddings generation...")
    # Let's extract the movies plots and turn them into embedding using jina-embeddings-v3
    from sentence_transformers import SentenceTransformer
    import torch

    # Check if CUDA is available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    # Load jina-embeddings-v3
    model = SentenceTransformer(
        'jinaai/jina-embeddings-v3', 
        device=device, 
        trust_remote_code=True
    )

    # Get actual dimension size
    dim = model.get_sentence_embedding_dimension()
    print(f"Model dimension: {dim}")


    print(f"Number of plots to encode: {len(plots)}")
    print(f"First plot preview: {plots[0][:100]}...")

    # Encode with proper parameters
    embeddings = model.encode(
        plots, 
        batch_size=8,  # Reduce batch size if memory issues
        show_progress_bar=True,
        normalize_embeddings=True  # Normalize for cosine similarity
    )

    print(f"Embeddings shape: {embeddings.shape}")
    print(f"Sample embedding (first 10 values): {embeddings[0][:10]}")

    # Save embeddings to a file
    import numpy as np
    np.save("../../data/processed/movie_embeddings.npy", embeddings)
    print("Embeddings saved to: ../../data/processed/movie_embeddings.npy")

else:
    print("Skipping embeddings generation. Load precomputed embeddings.")
    # Load precomputed embeddings
    embeddings = np.load("../../data/processed/movie_embeddings.npy")
    print(f"Loaded embeddings shape: {embeddings.shape}")
    print(f"Sample embedding (first 10 values): {embeddings[0][:10]}")

Running embeddings generation...

/share/crsp/lab/pkaiser/ddlin/project-3-ddlin-mids/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

Using device: cuda

flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.

Model dimension: 1024
Number of plots to encode: 1115
First plot preview:  Hildy Good, once the most prosperous realtor in Wendover, has fallen on hard times following her di...

Batches: 100%|██████████| 140/140 [00:26<00:00,  5.23it/s]

Embeddings shape: (1115, 1024)
Sample embedding (first 10 values): [-0.03710938 -0.06689453  0.04199219  0.12011719 -0.02429199 -0.0456543
 -0.01928711  0.16796875 -0.06835938 -0.05078125]
Embeddings saved to: ../../data/processed/movie_embeddings.npy
CPU times: user 31.3 s, sys: 1.74 s, total: 33 s
Wall time: 37.4 s

# Sanity check
print("\n=== SANITY CHECK ===")
print(f"Number of plots: {len(plots)}")
print(f"Number of embeddings: {embeddings.shape[0]}")

=== SANITY CHECK ===
Number of plots: 1115
Number of embeddings: 1115

	title	stars	directors	year	genre	runtime	ratingCount	plot	summary	imdb_rating	source
0	repulsion	[catherine deneuve, ian hendry, john fraser, y...	[roman polanski]	1965	[drama, horror, thriller]	105	54715	Carol Ledoux, a beautiful and shy Belgian Man...	{{Short description,1965 film by Roman Polansk...	7.6	tt0059646
1	the underworld story	[dan duryea, herbert marshall, gale storm, how...	[cy endfield]	1950	[crime, drama, film-noir]	91	1276	When big-city newspaper reporter Mike Reese (...	{{Short description,1950 film by Cy Endfield}}...	7.0	tt0043088
2	the undertaker and his pals	[warrene ott, james westmoreland, marty friedm...	[t.l.p. swicegood]	1966	[comedy, horror]	63	1186	A motorcycle-riding undertaker and his two bi...	{{Short description,1966 American horror film ...	4.5	tt0061140
3	this property is condemned	[natalie wood, robert redford, charles bronson...	[sydney pollack]	1966	[drama, romance]	110	6034	The film starts with a frame story in which a...	{{Short description,1966 film by Sydney Pollac...	7.0	tt0061089
4	the glass bottom boat	[doris day, rod taylor, arthur godfrey, john m...	[frank tashlin]	1966	[comedy, romance]	110	4908	Axel Nordstrom manages a glass-bottom boat to...	{{Short description,1966 American romantic com...	6.4	tt0060463

	title	year	plot
481	the good house	2021	Hildy Good, once the most prosperous realtor ...
482	she said	2022	In 2017, 'New York Times' reporter Jodi Kanto...
483	the menu	2022	Foodie Tyler Ledford and his date, Margot Mil...
484	80 for brady	2023	In 2017, best friends Lou, Trish, Maura, and ...
485	the last kingdom: seven kings must die	2023	Olaf Guthfrithson Anlaf, a Viking king from I...