In [4]:
# Standard library
from pathlib import Path
import pickle, gc
import warnings

# Data science libraries
import scipy
import numpy as np
import pandas as pd

# Scikit-learn
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error

# TensorFlow and Keras
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.layers import Dense, Input, Concatenate, Dropout, BatchNormalization

from PIL import Image
import matplotlib.pyplot as plt

# Suppress warnings
warnings.filterwarnings("ignore")

# Directory and file paths
DATA_DIR = Path("/share/crsp/lab/pkaiser/ddlin/single-cell-multimodal-ml/data")
RAW_DIR = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

FP_MULTIOME_TRAIN_INPUTS = RAW_DIR / "train_multi_inputs.h5"
FP_MULTIOME_TRAIN_TARGETS = RAW_DIR / "train_multi_targets.h5"
FP_MULTIOME_TEST_INPUTS = RAW_DIR / "test_multi_inputs.h5"
FP_SUBMISSION = RAW_DIR / "sample_submission.csv"
FP_EVALUATION_IDS = RAW_DIR / "evaluation_ids.csv"
FP_CELL_METADATA = RAW_DIR / "metadata.csv"

# Verbosity
VERBOSE = 0

# Check GPU availability
if tf.config.list_physical_devices('GPU'):
    print("GPU is available")
else:
    print("GPU is not available, using CPU")
GPU is not available, using CPU

KERAS for single cell Multiome¶

  • This dataset comprises single-cell multiomics data collected from mobilized peripheral CD34+ hematopoietic stem and progenitor cells (HSPCs) isolated from four healthy human donors.
  • From each culture plate at each sampling time point, cells were collected for measurement with two single-cell assays: the 10x Chromium Single Cell Multiome ATAC + Gene Expression technology (Multiome) and the 10x Genomics Single Cell Gene Expression with Feature Barcoding technology using the TotalSeq™-B Human Universal Cocktail, V1.0 (CITEseq).
In [ ]:
img = Image.open(DATA_DIR.joinpath("images", "multiome.jpg"))
plt.figure(figsize=(10, 8))
plt.imshow(img)
plt.axis('off')
plt.show()

Data¶

  • Multiome
    • train/test_multi_inputs.h5 - ATAC-seq peak counts (chromatin accessibility) transformed with TF-IDF using the default log(TF) * log(IDF) output. Rows correspond to cells and columns to the genomic regions whose accessibility is measured, identified by their coordinates on reference genome GRCh38 as provided in the 10x References - 2020-A (July 7, 2020).
    • train_multi_targets.h5 - RNA gene expression levels for the same cells, given as library-size normalized and log1p transformed counts. A minimal sketch of both transforms follows this list.
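
A minimal sketch of the two transforms described above (not the competition's exact preprocessing code), assuming hypothetical raw count matrices atac_counts (cells × peaks, scipy sparse) and rna_counts (cells × genes, NumPy array):
In [ ]:
import numpy as np
import scipy.sparse as sp

def tfidf_log(atac_counts, scale_factor=1e4):
    """Rough log(TF) * log(IDF) transform of a sparse peak-count matrix."""
    atac = sp.csr_matrix(atac_counts, dtype=np.float64)
    # term frequency: each cell's counts divided by its total counts
    tf_mat = sp.csr_matrix(atac.multiply(1.0 / atac.sum(axis=1)))
    log_tf = tf_mat.multiply(scale_factor).log1p()
    # inverse document frequency per peak, smoothed to avoid division by zero
    idf = np.log1p(atac.shape[0] / (1.0 + np.asarray((atac > 0).sum(axis=0)).ravel()))
    return sp.csr_matrix(log_tf.multiply(idf))

def lognorm(rna_counts, target_sum=1e4):
    """Library-size normalization followed by log1p, as used for the RNA targets."""
    size = rna_counts.sum(axis=1, keepdims=True)
    return np.log1p(rna_counts / size * target_sum)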

ML question framing¶

  • For Multiome samples: predict RNA expression level from chromatin accessibility.

------ Multiome MODEL ---------¶

------ Sparse Data Generation and Dimension Reduction ---------¶

  • scATAC-seq multiome data is highly sparse (>95% empty) and noisy.
  • We will address this by converting the .h5 output to a sparse matrix and applying dimension reduction.
  • These preprocessing steps are essential for downstream machine learning tasks.
  • Implementation details are in the dimension-reduction notebook; a minimal sketch of the idea is shown below this list.
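
A hedged sketch of those preprocessing steps (chunked HDF5 reading into a CSR sparse matrix, followed by truncated SVD down to 512 components); the chunk size and random seed below are assumptions, and the actual settings live in the dimension-reduction notebook:
In [ ]:
import pickle
import pandas as pd
import scipy.sparse as sp
from sklearn.decomposition import TruncatedSVD

def h5_to_sparse(path, n_rows, chunk=5000):
    """Read a dense HDF5 frame in row chunks and stack the chunks into one CSR matrix."""
    blocks, index = [], []
    for start in range(0, n_rows, chunk):
        df = pd.read_hdf(path, start=start, stop=start + chunk)
        blocks.append(sp.csr_matrix(df.values))
        index.extend(df.index)
    return sp.vstack(blocks, format="csr"), pd.Index(index)

# 105942 training cells (see the shapes printed further below)
X_sparse, cells = h5_to_sparse(FP_MULTIOME_TRAIN_INPUTS, n_rows=105942)
svd = TruncatedSVD(n_components=512, random_state=1)
X_emb = svd.fit_transform(X_sparse)                      # (105942, 512) dense embeddings
with open(PROCESSED_DIR / "train_multi_inputs_embeddings_512.pkl", "wb") as f:
    pickle.dump(X_emb, f)
with open(PROCESSED_DIR / "svd_inputs_model_512.pkl", "wb") as f:
    pickle.dump(svd, f)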
In [23]:
FILE_PATHS = {
    "train_inputs_embeddings": PROCESSED_DIR / "train_multi_inputs_embeddings_512.pkl",
    "test_inputs_embeddings": PROCESSED_DIR / "test_multi_inputs_embeddings_512.pkl",
    "train_targets_embeddings": PROCESSED_DIR / "train_multi_targets_embeddings_512.pkl",
    "svd_inputs_model": PROCESSED_DIR / "svd_inputs_model_512.pkl",
    "svd_targets_model": PROCESSED_DIR / "svd_targets_model_512.pkl",
}

# Function to load pickled file and print shape (if applicable)
def load_pickle(file_path, name):
    with open(file_path, "rb") as f:
        data = pickle.load(f)
    if hasattr(data, "shape"):
        print(f"Loaded {name}: {data.shape}")
    else:
        print(f"Loaded {name}")
    return data

# Load all data
data = {
    key: load_pickle(path, key) for key, path in FILE_PATHS.items() if path.suffix == ".pkl"
}

# Assign to variables for clarity
X = data["train_inputs_embeddings"]
Xt = data["test_inputs_embeddings"]
Y = data["train_targets_embeddings"]
svd_inputs = data["svd_inputs_model"]
svd_targets = data["svd_targets_model"]
Loaded train_inputs_embeddings: (105942, 512)
Loaded test_inputs_embeddings: (55935, 512)
Loaded train_targets_embeddings: (105942, 512)
Loaded svd_inputs_model
Loaded svd_targets_model
In [12]:
# We will take the first 40 components
X = X[:,:40]
X.shape
Out[12]:
(105942, 40)
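As a sanity check on this truncation (assuming svd_inputs is a fitted sklearn TruncatedSVD, which is how the embeddings were produced), the variance retained by the first 40 components could be inspected:
In [ ]:
# Hypothetical check: cumulative explained variance of the first 40 SVD components
print(f"Explained variance of first 40 components: {svd_inputs.explained_variance_ratio_[:40].sum():.3f}")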
In [26]:
# Load the train input metadata; these are the cells in the Multiome dataset
TRAIN_INPUTS_META = PROCESSED_DIR.joinpath("train_multi_inputs_metadata.npz")
meta_index = np.load(TRAIN_INPUTS_META, allow_pickle=True)['index']

metadata_df = pd.read_csv(DATA_DIR.joinpath("raw", "metadata.csv"),index_col='cell_id')
metadata_df = metadata_df[metadata_df.technology=="multiome"]
meta = metadata_df.reindex(meta_index)
print(f'Shape of meta: {meta.shape}')
meta.head()
Shape of meta: (105942, 4)
Out[26]:
              day  donor cell_type technology
cell_id
56390cf1b95e    2  32606      NeuP   multiome
fc0c60183c33    2  32606       HSC   multiome
9b4a87e22ad0    2  32606      MasP   multiome
81cccad8cd81    2  32606       HSC   multiome
15cb3d85c232    2  32606       MkP   multiome
In [27]:
Y.shape, X.shape
Out[27]:
((105942, 512), (105942, 512))

Training for Multiome¶

Model and Parameters¶

  • A basic "wide-and-deep" MLP model.
  • Hyperparameters optimized using Keras Tuner (a sketch of such a search is shown after this list).
  • Further architectural options could be investigated.
  • We seek to predict mRNA expression based on chromatin accessibility.
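A minimal sketch of how such a Keras Tuner search might look; the search space, tuner choice, and trial budget below are assumptions, not the exact configuration that produced the hyperparameters in the next cell:
In [ ]:
import keras_tuner as kt

def build_model(hp):
    """Simple hypermodel searching over L2 strength, dropout rate, and layer width."""
    reg = tf.keras.regularizers.l2(hp.Float("l2", 1e-7, 1e-4, sampling="log"))
    drop = hp.Float("dropout", 0.0, 0.3, step=0.05)
    units = hp.Choice("units", [256, 512, 1024])

    inputs = Input(shape=(X.shape[1],))
    x = Dense(units, activation="selu", kernel_regularizer=reg)(inputs)
    x = Dropout(drop)(x)
    outputs = Dense(Y.shape[1], activation="linear")(x)
    model = Model(inputs, outputs)
    model.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss="mse")
    return model

tuner = kt.BayesianOptimization(build_model, objective="val_loss", max_trials=20, overwrite=True)
# tuner.search(X_tr, y_tr, validation_data=(X_va, y_va), epochs=30)
# best_hp = tuner.get_best_hyperparameters(1)[0]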
In [14]:
LR_START = 0.01
BATCH_SIZE = 512

def create_model():
    # regularizers & dropout rate
    reg1 = 9.613e-06
    reg2 = 1e-07
    REG1 = tf.keras.regularizers.l2(reg1)
    REG2 = tf.keras.regularizers.l2(reg2)
    DROP = 0.1

    activation = 'selu'
    inputs = Input(shape=(X.shape[1],))

    # four successive Dense→Dropout blocks
    x0 = Dense(256, kernel_regularizer=REG1, activation=activation)(inputs)
    x0 = Dropout(DROP)(x0)

    x1 = Dense(512, kernel_regularizer=REG1, activation=activation)(x0)
    x1 = Dropout(DROP)(x1)

    x2 = Dense(512, kernel_regularizer=REG1, activation=activation)(x1)
    x2 = Dropout(DROP)(x2)

    x3 = Dense(Y.shape[1], kernel_regularizer=REG1, activation=activation)(x2)
    x3 = Dropout(DROP)(x3)

    # concatenate all four intermediate outputs
    x = Concatenate()([x0, x1, x2, x3])

    # final linear layer to produce Y.shape[1] outputs
    x = Dense(Y.shape[1], kernel_regularizer=REG2, activation='linear')(x)

    return Model(inputs, x)

test_model = create_model()
test_model.summary()
Model: "functional"
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
┃ Layer (type)        ┃ Output Shape      ┃    Param # ┃ Connected to      ┃
┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
│ input_layer         │ (None, 40)        │          0 │ -                 │
│ (InputLayer)        │                   │            │                   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dense (Dense)       │ (None, 256)       │     10,496 │ input_layer[0][0] │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dropout (Dropout)   │ (None, 256)       │          0 │ dense[0][0]       │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dense_1 (Dense)     │ (None, 512)       │    131,584 │ dropout[0][0]     │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dropout_1 (Dropout) │ (None, 512)       │          0 │ dense_1[0][0]     │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dense_2 (Dense)     │ (None, 512)       │    262,656 │ dropout_1[0][0]   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dropout_2 (Dropout) │ (None, 512)       │          0 │ dense_2[0][0]     │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dense_3 (Dense)     │ (None, 512)       │    262,656 │ dropout_2[0][0]   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dropout_3 (Dropout) │ (None, 512)       │          0 │ dense_3[0][0]     │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ concatenate         │ (None, 1792)      │          0 │ dropout[0][0],    │
│ (Concatenate)       │                   │            │ dropout_1[0][0],  │
│                     │                   │            │ dropout_2[0][0],  │
│                     │                   │            │ dropout_3[0][0]   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dense_4 (Dense)     │ (None, 512)       │    918,016 │ concatenate[0][0] │
└─────────────────────┴───────────────────┴────────────┴───────────────────┘
 Total params: 1,585,408 (6.05 MB)
 Trainable params: 1,585,408 (6.05 MB)
 Non-trainable params: 0 (0.00 B)
In [41]:
def correlation_score(y_true, y_pred):
    """
    Compute the mean Pearson correlation coefficient between true and predicted values.

    Parameters:
    - y_true: Ground truth values (numpy array or pandas DataFrame) of shape (n_samples, n_targets).
    - y_pred: Predicted values (numpy array or pandas DataFrame) of shape (n_samples, n_targets).

    Returns:
    - float: Mean Pearson correlation coefficient across all samples.
    """
    # Convert pandas DataFrames to numpy arrays if needed
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.values
    if isinstance(y_pred, pd.DataFrame):
        y_pred = y_pred.values
    
    corrsum = 0  # Initialize sum of correlation coefficients
    # Iterate over each sample (row) to compute correlation
    for i in range(len(y_true)):
        # Calculate Pearson correlation coefficient for the i-th sample
        # np.corrcoef returns a 2x2 matrix; [1,0] is the off-diagonal (true vs. pred)
        corrsum += np.corrcoef(y_true[i], y_pred[i])[1, 0]
    
    # Return the mean correlation across all samples
    return corrsum / len(y_true)
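An equivalent vectorized version (an optional speed-up, not used in the runs below): center each row of both matrices and average the normalized row-wise dot products.
In [ ]:
def correlation_score_fast(y_true, y_pred):
    """Vectorized per-row Pearson correlation, averaged over rows."""
    yt = np.asarray(y_true, dtype=np.float64)
    yp = np.asarray(y_pred, dtype=np.float64)
    yt = yt - yt.mean(axis=1, keepdims=True)
    yp = yp - yp.mean(axis=1, keepdims=True)
    num = (yt * yp).sum(axis=1)
    den = np.sqrt((yt ** 2).sum(axis=1) * (yp ** 2).sum(axis=1))
    return float(np.mean(num / den))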
In [43]:
model_dir = DATA_DIR.joinpath("models", "atacseq")
model_dir.mkdir(parents=True, exist_ok=True)

EPOCHS = 500
N_SPLITS = 3
VERBOSE = 0

kf = GroupKFold(n_splits=N_SPLITS)

# To store OOF predictions and metrics
pred_oof = np.zeros_like(Y)
mse_scores = []
corr_scores = []

for fold, (idx_tr, idx_va) in enumerate(kf.split(X, groups=meta.donor)):
    X_tr, y_tr = X[idx_tr], Y[idx_tr]
    X_va, y_va = X[idx_va], Y[idx_va]

    # Build and compile
    model = create_model()
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss='mse'
    )

    # Callbacks
    lr = ReduceLROnPlateau(monitor="val_loss", factor=0.9, patience=4, verbose=1)
    es = EarlyStopping(monitor="val_loss", patience=30, mode="min", restore_best_weights=True, verbose=1)

    # Train
    model.fit(
        X_tr, y_tr,
        validation_data=(X_va, y_va),
        epochs=EPOCHS,
        batch_size=256,
        callbacks=[es, lr],
        verbose=VERBOSE,
        shuffle=True
    )

    # Predict
    y_pred = model.predict(X_va)
    pred_oof[idx_va] = y_pred

    # Compute metrics
    mse = mean_squared_error(y_va, y_pred)
    corr = correlation_score(y_va, y_pred)
    mse_scores.append(mse)
    corr_scores.append(corr)

    # Save fold model
    model_path = model_dir / f"model_{fold}.keras"
    model.save(model_path)

    print(f"\n— Fold {fold} —")
    print(f"  MSE       = {mse:.4f}")
    print(f"  Corr      = {corr:.4f}")

    # Clean up
    del X_tr, y_tr, X_va, y_va, y_pred, model
    gc.collect()

# Summary across folds
mean_mse = np.mean(mse_scores)
mean_corr = np.mean(corr_scores)
oof_corr = correlation_score(Y, pred_oof)

print("\n=== Cross‑Validation Summary ===")
print(f"Mean MSE       = {mean_mse:.4f}")
print(f"Mean Corr      = {mean_corr:.4f}")
print(f"Out‑of‑Fold Corr = {oof_corr:.4f}")
Epoch 19: ReduceLROnPlateau reducing learning rate to 0.0009000000427477062.

Epoch 23: ReduceLROnPlateau reducing learning rate to 0.0008100000384729356.

Epoch 27: ReduceLROnPlateau reducing learning rate to 0.0007290000503417104.

Epoch 31: ReduceLROnPlateau reducing learning rate to 0.0006561000715009868.

Epoch 37: ReduceLROnPlateau reducing learning rate to 0.0005904900433961303.

Epoch 44: ReduceLROnPlateau reducing learning rate to 0.0005314410547725857.

Epoch 48: ReduceLROnPlateau reducing learning rate to 0.00047829695977270604.

Epoch 52: ReduceLROnPlateau reducing learning rate to 0.0004304672533180565.

Epoch 56: ReduceLROnPlateau reducing learning rate to 0.00038742052274756136.

Epoch 60: ReduceLROnPlateau reducing learning rate to 0.0003486784757114947.

Epoch 64: ReduceLROnPlateau reducing learning rate to 0.00031381062290165574.

Epoch 68: ReduceLROnPlateau reducing learning rate to 0.0002824295632308349.
Epoch 70: early stopping
Restoring model weights from the end of the best epoch: 40.
1152/1152 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step

— Fold 0 —
  MSE       = 10.5810
  Corr      = 0.9547

Epoch 15: ReduceLROnPlateau reducing learning rate to 0.0009000000427477062.

Epoch 21: ReduceLROnPlateau reducing learning rate to 0.0008100000384729356.

Epoch 25: ReduceLROnPlateau reducing learning rate to 0.0007290000503417104.

Epoch 29: ReduceLROnPlateau reducing learning rate to 0.0006561000715009868.

Epoch 33: ReduceLROnPlateau reducing learning rate to 0.0005904900433961303.

Epoch 40: ReduceLROnPlateau reducing learning rate to 0.0005314410547725857.

Epoch 45: ReduceLROnPlateau reducing learning rate to 0.00047829695977270604.

Epoch 49: ReduceLROnPlateau reducing learning rate to 0.0004304672533180565.

Epoch 53: ReduceLROnPlateau reducing learning rate to 0.00038742052274756136.

Epoch 57: ReduceLROnPlateau reducing learning rate to 0.0003486784757114947.

Epoch 61: ReduceLROnPlateau reducing learning rate to 0.00031381062290165574.

Epoch 65: ReduceLROnPlateau reducing learning rate to 0.0002824295632308349.

Epoch 69: ReduceLROnPlateau reducing learning rate to 0.00025418660952709616.
Epoch 71: early stopping
Restoring model weights from the end of the best epoch: 41.
1107/1107 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step

— Fold 1 —
  MSE       = 10.0715
  Corr      = 0.9573

Epoch 12: ReduceLROnPlateau reducing learning rate to 0.0009000000427477062.

Epoch 31: ReduceLROnPlateau reducing learning rate to 0.0008100000384729356.

Epoch 35: ReduceLROnPlateau reducing learning rate to 0.0007290000503417104.

Epoch 39: ReduceLROnPlateau reducing learning rate to 0.0006561000715009868.

Epoch 43: ReduceLROnPlateau reducing learning rate to 0.0005904900433961303.

Epoch 47: ReduceLROnPlateau reducing learning rate to 0.0005314410547725857.

Epoch 51: ReduceLROnPlateau reducing learning rate to 0.00047829695977270604.

Epoch 55: ReduceLROnPlateau reducing learning rate to 0.0004304672533180565.
Epoch 57: early stopping
Restoring model weights from the end of the best epoch: 27.
1054/1054 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step

— Fold 2 —
  MSE       = 10.0670
  Corr      = 0.9561

=== Cross‑Validation Summary ===
Mean MSE       = 10.2398
Mean Corr      = 0.9560
Out‑of‑Fold Corr = 0.9560

Results¶

The model demonstrated excellent performance in predicting gene expression from chromatin accessibility. 📈

  • Mean Out‑of‑Fold Correlation Score: 0.9560
  • Mean MSE (Mean Squared Error) across folds: 10.2398

These results show a very high correlation between the predicted and actual gene expression embeddings, indicating that the model successfully learned the relationship between the two data modalities. Note that these scores are computed on the 512-dimensional SVD embeddings of the targets; the per-gene correlation on the held-out test set (the public leaderboard metric) is lower, as reported below.

In [ ]:
# Public score: 0.81 (Pearson correlation of the submitted predictions against the public test set).
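# Hedged sketch (not run here) of the inference step behind that score: average
# the three fold models' predictions on the test embeddings, then map the
# predicted 512-dimensional target embeddings back to per-gene expression with
# the fitted target SVD. Assembling the final submission from
# evaluation_ids.csv is omitted. Assumes svd_targets is a fitted sklearn
# TruncatedSVD and that Xt is sliced to the same components used for training.
test_pred = np.zeros((Xt.shape[0], Y.shape[1]), dtype=np.float32)
for fold in range(N_SPLITS):
    m = tf.keras.models.load_model(model_dir / f"model_{fold}.keras")
    test_pred += m.predict(Xt[:, :X.shape[1]]) / N_SPLITS
test_pred_genes = svd_targets.inverse_transform(test_pred)  # back to full gene space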
In [ ]: