In [4]:
# Standard library
from pathlib import Path
import pickle, gc
import warnings

# Data science libraries
import scipy
import numpy as np
import pandas as pd

# Scikit-learn
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error

# TensorFlow and Keras
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.layers import Dense, Input, Concatenate, Dropout, BatchNormalization

from PIL import Image
import matplotlib.pyplot as plt

# Suppress warnings
warnings.filterwarnings("ignore")

# Directory and file paths
DATA_DIR = Path("/share/crsp/lab/pkaiser/ddlin/single-cell-multimodal-ml/data")
RAW_DIR = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

FP_MULTIOME_TRAIN_INPUTS = RAW_DIR / "train_multi_inputs.h5"
FP_MULTIOME_TRAIN_TARGETS = RAW_DIR / "train_multi_targets.h5"
FP_MULTIOME_TEST_INPUTS = RAW_DIR / "test_multi_inputs.h5"
FP_SUBMISSION = RAW_DIR / "sample_submission.csv"
FP_EVALUATION_IDS = RAW_DIR / "evaluation_ids.csv"
FP_CELL_METADATA = RAW_DIR / "metadata.csv"

# Verbosity
VERBOSE = 0

# Check GPU availability
if tf.config.list_physical_devices('GPU'):
    print("GPU is available")
else:
    print("GPU is not available, using CPU")
GPU is not available, using CPU

KERAS for single cell Multiome¶

  • This dataset comprises single-cell multiomics data collected from mobilized peripheral CD34+ hematopoietic stem and progenitor cells (HSPCs) isolated from four healthy human donors.
  • From each culture plate at each sampling time point, cells were collected for measurement with two single-cell assays: the 10x Chromium Single Cell Multiome ATAC + Gene Expression technology (Multiome) and the 10x Genomics Single Cell Gene Expression with Feature Barcoding technology using the TotalSeq™-B Human Universal Cocktail, V1.0 (CITEseq).
In [ ]:
img = Image.open(DATA_DIR.joinpath("images", "multiome.jpg"))
plt.figure(figsize=(10, 8))
plt.imshow(img)
plt.axis('off')
plt.show()

Data¶

  • Multiome
    • train/test_multi_inputs.h5 - ATAC-seq peak counts (chromatin accessibility) transformed with TF-IDF using the default log(TF) * log(IDF) output. Rows correspond to cells and columns to the genomic regions whose accessibility is measured, identified by their coordinates on reference genome GRCh38 as provided in the 10x References - 2020-A (July 7, 2020).
    • train_multi_targets.h5 - RNA gene expression levels for the same cells, given as library-size normalized and log1p transformed counts. A minimal sketch of both transforms follows this list.
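
A minimal sketch of the two transforms described above (not the competition's exact preprocessing code), assuming hypothetical raw count matrices atac_counts (cells × peaks, scipy sparse) and rna_counts (cells × genes, NumPy array):
In [ ]:
import numpy as np
import scipy.sparse as sp

def tfidf_log(atac_counts, scale_factor=1e4):
    """Rough log(TF) * log(IDF) transform of a sparse peak-count matrix."""
    atac = sp.csr_matrix(atac_counts, dtype=np.float64)
    # term frequency: each cell's counts divided by its total counts
    tf_mat = sp.csr_matrix(atac.multiply(1.0 / atac.sum(axis=1)))
    log_tf = tf_mat.multiply(scale_factor).log1p()
    # inverse document frequency per peak, smoothed to avoid division by zero
    idf = np.log1p(atac.shape[0] / (1.0 + np.asarray((atac > 0).sum(axis=0)).ravel()))
    return sp.csr_matrix(log_tf.multiply(idf))

def lognorm(rna_counts, target_sum=1e4):
    """Library-size normalization followed by log1p, as used for the RNA targets."""
    size = rna_counts.sum(axis=1, keepdims=True)
    return np.log1p(rna_counts / size * target_sum)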

ML question framing¶

  • For Multiome samples: predict RNA expression level from chromatin accessibility.

------ Multiome MODEL ---------¶

------ Sparse Data Generation and Dimension Reduction ---------¶

  • scATAC-seq multiome data is highly sparse (>95% empty) and noisy.
  • We will address this by converting the .h5 output to a sparse matrix and applying dimension reduction.
  • These preprocessing steps are essential for downstream machine learning tasks.
  • Implementation details are in the dimension-reduction notebook; a minimal sketch of the idea is shown below this list.
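
A hedged sketch of those preprocessing steps (chunked HDF5 reading into a CSR sparse matrix, followed by truncated SVD down to 512 components); the chunk size and random seed below are assumptions, and the actual settings live in the dimension-reduction notebook:
In [ ]:
import pickle
import pandas as pd
import scipy.sparse as sp
from sklearn.decomposition import TruncatedSVD

def h5_to_sparse(path, n_rows, chunk=5000):
    """Read a dense HDF5 frame in row chunks and stack the chunks into one CSR matrix."""
    blocks, index = [], []
    for start in range(0, n_rows, chunk):
        df = pd.read_hdf(path, start=start, stop=start + chunk)
        blocks.append(sp.csr_matrix(df.values))
        index.extend(df.index)
    return sp.vstack(blocks, format="csr"), pd.Index(index)

# 105942 training cells (see the shapes printed further below)
X_sparse, cells = h5_to_sparse(FP_MULTIOME_TRAIN_INPUTS, n_rows=105942)
svd = TruncatedSVD(n_components=512, random_state=1)
X_emb = svd.fit_transform(X_sparse)                      # (105942, 512) dense embeddings
with open(PROCESSED_DIR / "train_multi_inputs_embeddings_512.pkl", "wb") as f:
    pickle.dump(X_emb, f)
with open(PROCESSED_DIR / "svd_inputs_model_512.pkl", "wb") as f:
    pickle.dump(svd, f)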
In [23]:
FILE_PATHS = {
    "train_inputs_embeddings": PROCESSED_DIR / "train_multi_inputs_embeddings_512.pkl",
    "test_inputs_embeddings": PROCESSED_DIR / "test_multi_inputs_embeddings_512.pkl",
    "train_targets_embeddings": PROCESSED_DIR / "train_multi_targets_embeddings_512.pkl",
    "svd_inputs_model": PROCESSED_DIR / "svd_inputs_model_512.pkl",
    "svd_targets_model": PROCESSED_DIR / "svd_targets_model_512.pkl",
}

# Function to load pickled file and print shape (if applicable)
def load_pickle(file_path, name):
    with open(file_path, "rb") as f:
        data = pickle.load(f)
    if hasattr(data, "shape"):
        print(f"Loaded {name}: {data.shape}")
    else:
        print(f"Loaded {name}")
    return data

# Load all data
data = {
    key: load_pickle(path, key) for key, path in FILE_PATHS.items() if path.suffix == ".pkl"
}

# Assign to variables for clarity
X = data["train_inputs_embeddings"]
Xt = data["test_inputs_embeddings"]
Y = data["train_targets_embeddings"]
svd_inputs = data["svd_inputs_model"]
svd_targets = data["svd_targets_model"]
Loaded train_inputs_embeddings: (105942, 512)
Loaded test_inputs_embeddings: (55935, 512)
Loaded train_targets_embeddings: (105942, 512)
Loaded svd_inputs_model
Loaded svd_targets_model
In [12]:
# We will take the first 40 components
X = X[:,:40]
X.shape
Out[12]:
(105942, 40)
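As a sanity check on this truncation (assuming svd_inputs is a fitted sklearn TruncatedSVD, which is how the embeddings were produced), the variance retained by the first 40 components could be inspected:
In [ ]:
# Hypothetical check: cumulative explained variance of the first 40 SVD components
print(f"Explained variance of first 40 components: {svd_inputs.explained_variance_ratio_[:40].sum():.3f}")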
In [26]:
# Load the train input metadata; these are the cells in the Multiome dataset
TRAIN_INPUTS_META = PROCESSED_DIR.joinpath("train_multi_inputs_metadata.npz")
meta_index = np.load(TRAIN_INPUTS_META, allow_pickle=True)['index']

metadata_df = pd.read_csv(DATA_DIR.joinpath("raw", "metadata.csv"),index_col='cell_id')
metadata_df = metadata_df[metadata_df.technology=="multiome"]
meta = metadata_df.reindex(meta_index)
print(f'Shape of meta: {meta.shape}')
meta.head()
Shape of meta: (105942, 4)
Out[26]:
              day  donor cell_type technology
cell_id
56390cf1b95e    2  32606      NeuP   multiome
fc0c60183c33    2  32606       HSC   multiome
9b4a87e22ad0    2  32606      MasP   multiome
81cccad8cd81    2  32606       HSC   multiome
15cb3d85c232    2  32606       MkP   multiome
In [27]:
Y.shape, X.shape
Out[27]:
((105942, 512), (105942, 512))

Training for Multiome¶

Model and Parameters¶

  • A basic "wide-and-deep" MLP model.
  • Hyperparameters optimized using Keras Tuner (a sketch of such a search is shown after this list).
  • Further architectural options could be investigated.
  • We seek to predict mRNA expression based on chromatin accessibility.
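A minimal sketch of how such a Keras Tuner search might look; the search space, tuner choice, and trial budget below are assumptions, not the exact configuration that produced the hyperparameters in the next cell:
In [ ]:
import keras_tuner as kt

def build_model(hp):
    """Simple hypermodel searching over L2 strength, dropout rate, and layer width."""
    reg = tf.keras.regularizers.l2(hp.Float("l2", 1e-7, 1e-4, sampling="log"))
    drop = hp.Float("dropout", 0.0, 0.3, step=0.05)
    units = hp.Choice("units", [256, 512, 1024])

    inputs = Input(shape=(X.shape[1],))
    x = Dense(units, activation="selu", kernel_regularizer=reg)(inputs)
    x = Dropout(drop)(x)
    outputs = Dense(Y.shape[1], activation="linear")(x)
    model = Model(inputs, outputs)
    model.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss="mse")
    return model

tuner = kt.BayesianOptimization(build_model, objective="val_loss", max_trials=20, overwrite=True)
# tuner.search(X_tr, y_tr, validation_data=(X_va, y_va), epochs=30)
# best_hp = tuner.get_best_hyperparameters(1)[0]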
In [14]:
LR_START = 0.01
BATCH_SIZE = 512

def create_model():
    # regularizers & dropout rate
    reg1 = 9.613e-06
    reg2 = 1e-07
    REG1 = tf.keras.regularizers.l2(reg1)
    REG2 = tf.keras.regularizers.l2(reg2)
    DROP = 0.1

    activation = 'selu'
    inputs = Input(shape=(X.shape[1],))

    # four successive Dense→Dropout blocks
    x0 = Dense(256, kernel_regularizer=REG1, activation=activation)(inputs)
    x0 = Dropout(DROP)(x0)

    x1 = Dense(512, kernel_regularizer=REG1, activation=activation)(x0)
    x1 = Dropout(DROP)(x1)

    x2 = Dense(512, kernel_regularizer=REG1, activation=activation)(x1)
    x2 = Dropout(DROP)(x2)

    x3 = Dense(Y.shape[1], kernel_regularizer=REG1, activation=activation)(x2)
    x3 = Dropout(DROP)(x3)

    # concatenate all four intermediate outputs
    x = Concatenate()([x0, x1, x2, x3])

    # final linear layer to produce Y.shape[1] outputs
    x = Dense(Y.shape[1], kernel_regularizer=REG2, activation='linear')(x)

    return Model(inputs, x)

test_model = create_model()
test_model.summary()
Model: "functional"
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
┃ Layer (type)        ┃ Output Shape      ┃    Param # ┃ Connected to      ┃
┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
│ input_layer         │ (None, 40)        │          0 │ -                 │
│ (InputLayer)        │                   │            │                   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dense (Dense)       │ (None, 256)       │     10,496 │ input_layer[0][0] │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dropout (Dropout)   │ (None, 256)       │          0 │ dense[0][0]       │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dense_1 (Dense)     │ (None, 512)       │    131,584 │ dropout[0][0]     │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dropout_1 (Dropout) │ (None, 512)       │          0 │ dense_1[0][0]     │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dense_2 (Dense)     │ (None, 512)       │    262,656 │ dropout_1[0][0]   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dropout_2 (Dropout) │ (None, 512)       │          0 │ dense_2[0][0]     │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dense_3 (Dense)     │ (None, 512)       │    262,656 │ dropout_2[0][0]   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dropout_3 (Dropout) │ (None, 512)       │          0 │ dense_3[0][0]     │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ concatenate         │ (None, 1792)      │          0 │ dropout[0][0],    │
│ (Concatenate)       │                   │            │ dropout_1[0][0],  │
│                     │                   │            │ dropout_2[0][0],  │
│                     │                   │            │ dropout_3[0][0]   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dense_4 (Dense)     │ (None, 512)       │    918,016 │ concatenate[0][0] │
└─────────────────────┴───────────────────┴────────────┴───────────────────┘
 Total params: 1,585,408 (6.05 MB)
 Trainable params: 1,585,408 (6.05 MB)
 Non-trainable params: 0 (0.00 B)
In [41]:
def correlation_score(y_true, y_pred):
    """
    Compute the mean Pearson correlation coefficient between true and predicted values.

    Parameters:
    - y_true: Ground truth values (numpy array or pandas DataFrame) of shape (n_samples, n_targets).
    - y_pred: Predicted values (numpy array or pandas DataFrame) of shape (n_samples, n_targets).

    Returns:
    - float: Mean Pearson correlation coefficient across all samples.
    """
    # Convert pandas DataFrames to numpy arrays if needed
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.values
    if isinstance(y_pred, pd.DataFrame):
        y_pred = y_pred.values
    
    corrsum = 0  # Initialize sum of correlation coefficients
    # Iterate over each sample (row) to compute correlation
    for i in range(len(y_true)):
        # Calculate Pearson correlation coefficient for the i-th sample
        # np.corrcoef returns a 2x2 matrix; [1,0] is the off-diagonal (true vs. pred)
        corrsum += np.corrcoef(y_true[i], y_pred[i])[1, 0]
    
    # Return the mean correlation across all samples
    return corrsum / len(y_true)
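An equivalent vectorized version (an optional speed-up, not used in the runs below): center each row of both matrices and average the normalized row-wise dot products.
In [ ]:
def correlation_score_fast(y_true, y_pred):
    """Vectorized per-row Pearson correlation, averaged over rows."""
    yt = np.asarray(y_true, dtype=np.float64)
    yp = np.asarray(y_pred, dtype=np.float64)
    yt = yt - yt.mean(axis=1, keepdims=True)
    yp = yp - yp.mean(axis=1, keepdims=True)
    num = (yt * yp).sum(axis=1)
    den = np.sqrt((yt ** 2).sum(axis=1) * (yp ** 2).sum(axis=1))
    return float(np.mean(num / den))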
In [43]:
model_dir = DATA_DIR.joinpath("models", "atacseq")
model_dir.mkdir(parents=True, exist_ok=True)

EPOCHS = 500
N_SPLITS = 3
VERBOSE = 0

kf = GroupKFold(n_splits=N_SPLITS)

# To store OOF predictions and metrics
pred_oof = np.zeros_like(Y)
mse_scores = []
corr_scores = []

for fold, (idx_tr, idx_va) in enumerate(kf.split(X, groups=meta.donor)):
    X_tr, y_tr = X[idx_tr], Y[idx_tr]
    X_va, y_va = X[idx_va], Y[idx_va]

    # Build and compile
    model = create_model()
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss='mse'
    )

    # Callbacks
    lr = ReduceLROnPlateau(monitor="val_loss", factor=0.9, patience=4, verbose=1)
    es = EarlyStopping(monitor="val_loss", patience=30, mode="min", restore_best_weights=True, verbose=1)

    # Train
    model.fit(
        X_tr, y_tr,
        validation_data=(X_va, y_va),
        epochs=EPOCHS,
        batch_size=256,
        callbacks=[es, lr],
        verbose=VERBOSE,
        shuffle=True
    )

    # Predict
    y_pred = model.predict(X_va)
    pred_oof[idx_va] = y_pred

    # Compute metrics
    mse = mean_squared_error(y_va, y_pred)
    corr = correlation_score(y_va, y_pred)
    mse_scores.append(mse)
    corr_scores.append(corr)

    # Save fold model
    model_path = model_dir / f"model_{fold}.keras"
    model.save(model_path)

    print(f"\n— Fold {fold} —")
    print(f"  MSE       = {mse:.4f}")
    print(f"  Corr      = {corr:.4f}")

    # Clean up
    del X_tr, y_tr, X_va, y_va, y_pred, model
    gc.collect()

# Summary across folds
mean_mse = np.mean(mse_scores)
mean_corr = np.mean(corr_scores)
oof_corr = correlation_score(Y, pred_oof)

print("\n=== Cross‑Validation Summary ===")
print(f"Mean MSE       = {mean_mse:.4f}")
print(f"Mean Corr      = {mean_corr:.4f}")
print(f"Out‑of‑Fold Corr = {oof_corr:.4f}")
Epoch 19: ReduceLROnPlateau reducing learning rate to 0.0009000000427477062.

Epoch 23: ReduceLROnPlateau reducing learning rate to 0.0008100000384729356.

Epoch 27: ReduceLROnPlateau reducing learning rate to 0.0007290000503417104.

Epoch 31: ReduceLROnPlateau reducing learning rate to 0.0006561000715009868.

Epoch 37: ReduceLROnPlateau reducing learning rate to 0.0005904900433961303.

Epoch 44: ReduceLROnPlateau reducing learning rate to 0.0005314410547725857.

Epoch 48: ReduceLROnPlateau reducing learning rate to 0.00047829695977270604.

Epoch 52: ReduceLROnPlateau reducing learning rate to 0.0004304672533180565.

Epoch 56: ReduceLROnPlateau reducing learning rate to 0.00038742052274756136.

Epoch 60: ReduceLROnPlateau reducing learning rate to 0.0003486784757114947.

Epoch 64: ReduceLROnPlateau reducing learning rate to 0.00031381062290165574.

Epoch 68: ReduceLROnPlateau reducing learning rate to 0.0002824295632308349.
Epoch 70: early stopping
Restoring model weights from the end of the best epoch: 40.
1152/1152 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step

— Fold 0 —
  MSE       = 10.5810
  Corr      = 0.9547

Epoch 15: ReduceLROnPlateau reducing learning rate to 0.0009000000427477062.

Epoch 21: ReduceLROnPlateau reducing learning rate to 0.0008100000384729356.

Epoch 25: ReduceLROnPlateau reducing learning rate to 0.0007290000503417104.

Epoch 29: ReduceLROnPlateau reducing learning rate to 0.0006561000715009868.

Epoch 33: ReduceLROnPlateau reducing learning rate to 0.0005904900433961303.

Epoch 40: ReduceLROnPlateau reducing learning rate to 0.0005314410547725857.

Epoch 45: ReduceLROnPlateau reducing learning rate to 0.00047829695977270604.

Epoch 49: ReduceLROnPlateau reducing learning rate to 0.0004304672533180565.

Epoch 53: ReduceLROnPlateau reducing learning rate to 0.00038742052274756136.

Epoch 57: ReduceLROnPlateau reducing learning rate to 0.0003486784757114947.

Epoch 61: ReduceLROnPlateau reducing learning rate to 0.00031381062290165574.

Epoch 65: ReduceLROnPlateau reducing learning rate to 0.0002824295632308349.

Epoch 69: ReduceLROnPlateau reducing learning rate to 0.00025418660952709616.
Epoch 71: early stopping
Restoring model weights from the end of the best epoch: 41.
1107/1107 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step

— Fold 1 —
  MSE       = 10.0715
  Corr      = 0.9573

Epoch 12: ReduceLROnPlateau reducing learning rate to 0.0009000000427477062.

Epoch 31: ReduceLROnPlateau reducing learning rate to 0.0008100000384729356.

Epoch 35: ReduceLROnPlateau reducing learning rate to 0.0007290000503417104.

Epoch 39: ReduceLROnPlateau reducing learning rate to 0.0006561000715009868.

Epoch 43: ReduceLROnPlateau reducing learning rate to 0.0005904900433961303.

Epoch 47: ReduceLROnPlateau reducing learning rate to 0.0005314410547725857.

Epoch 51: ReduceLROnPlateau reducing learning rate to 0.00047829695977270604.

Epoch 55: ReduceLROnPlateau reducing learning rate to 0.0004304672533180565.
Epoch 57: early stopping
Restoring model weights from the end of the best epoch: 27.
1054/1054 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step

— Fold 2 —
  MSE       = 10.0670
  Corr      = 0.9561

=== Cross‑Validation Summary ===
Mean MSE       = 10.2398
Mean Corr      = 0.9560
Out‑of‑Fold Corr = 0.9560

Results¶

The model demonstrated excellent performance in predicting gene expression from chromatin accessibility. 📈

  • Mean Out‑of‑Fold Correlation Score: 0.9560
  • Mean MSE (Mean Squared Error) across folds: 10.2398

These results show a very high correlation between the predicted and actual gene expression embeddings, indicating that the model successfully learned the relationship between the two data modalities. Note that these scores are computed on the 512-dimensional SVD embeddings of the targets; the per-gene correlation on the held-out test set (the public leaderboard metric) is lower, as reported below.

In [ ]:
# Public score: 0.81 (Pearson correlation of the submitted predictions against the public test set).
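# Hedged sketch (not run here) of the inference step behind that score: average
# the three fold models' predictions on the test embeddings, then map the
# predicted 512-dimensional target embeddings back to per-gene expression with
# the fitted target SVD. Assembling the final submission from
# evaluation_ids.csv is omitted. Assumes svd_targets is a fitted sklearn
# TruncatedSVD and that Xt is sliced to the same components used for training.
test_pred = np.zeros((Xt.shape[0], Y.shape[1]), dtype=np.float32)
for fold in range(N_SPLITS):
    m = tf.keras.models.load_model(model_dir / f"model_{fold}.keras")
    test_pred += m.predict(Xt[:, :X.shape[1]]) / N_SPLITS
test_pred_genes = svd_targets.inverse_transform(test_pred)  # back to full gene space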
In [ ]: