In [1]:
# Standard library
import gc
import warnings
from typing import Union, Optional, Tuple, List, Dict, Any
from pathlib import Path
import pickle
# Core data science
import numpy as np
import pandas as pd
import scipy
from sklearn.decomposition import TruncatedSVD, PCA
warnings.filterwarnings("ignore")
DATA_DIR = Path("/share/crsp/lab/pkaiser/ddlin/single-cell-multimodal-ml/data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DIR = DATA_DIR / "processed"
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
FP_CITE_TRAIN_INPUTS = RAW_DIR.joinpath("train_cite_inputs.h5")
FP_CITE_TRAIN_TARGETS = RAW_DIR.joinpath("train_cite_targets.h5")
FP_CITE_TEST_INPUTS = RAW_DIR.joinpath("test_cite_inputs.h5")
FP_MULTIOME_TRAIN_INPUT = RAW_DIR.joinpath("train_multi_inputs.h5")
FP_MULTIOME_TRAIN_TARGETS = RAW_DIR.joinpath("train_multi_targets.h5")
FP_MULTIOME_TEST_INPUTS = RAW_DIR.joinpath("test_multi_inputs.h5")
FP_SUBMISSION = RAW_DIR.joinpath("sample_submission.csv")
FP_EVALUATION_IDS = RAW_DIR.joinpath("evaluation_ids.csv")
FP_CELL_METADATA = RAW_DIR.joinpath("metadata.csv")
VERBOSE = 0
------ Sparse Data Generation and Dimension Reduction---------¶
- The multiome data from scATAC seq is sparce (>95% empty) and noisy
- We will handle it by converting the .h5 output to a sparce matrix, followed by dimension reduction
- This data preprocessing are critical for the downstream ML works
------ Scipy Sparce Matrices ---------¶
In [8]:
class MemoryMonitor:
"""Context manager for monitoring memory usage during processing"""
def __init__(self, operation_name: str):
self.operation_name = operation_name
self.initial_memory = None
def __enter__(self):
gc.collect() # Clean up before measurement
self.initial_memory = psutil.virtual_memory().used / (1024**3) # GB
print(f"Starting {self.operation_name} - Initial memory: {self.initial_memory:.2f} GB")
return self
def __exit__(self, exc_type, exc_val, exc_tb):
gc.collect()
final_memory = psutil.virtual_memory().used / (1024**3) # GB
print(f"Completed {self.operation_name} - Final memory: {final_memory:.2f} GB "
f"(Δ: {final_memory - self.initial_memory:+.2f} GB)")
def get_optimal_chunksize(filename: str, target_memory_gb: float = 2.0) -> int:
"""
Calculate optimal chunk size based on file size and available memory.
Args:
filename: Path to HDF5 file
target_memory_gb: Target memory usage per chunk in GB
Returns:
Optimal chunk size
"""
try:
with pd.HDFStore(filename, mode='r') as store:
# Get basic info about the dataset
info = store.get_storer(list(store.keys())[0])
total_rows = info.nrows
total_cols = info.ncols
# Estimate memory per row (assuming float64, ~8 bytes per element)
bytes_per_row = total_cols * 8
target_bytes = target_memory_gb * (1024**3)
optimal_chunk = int(target_bytes / bytes_per_row)
# Ensure reasonable bounds
optimal_chunk = max(1000, min(optimal_chunk, 10000))
print(f"Dataset info: {total_rows:,} rows × {total_cols:,} cols")
print(f"Calculated optimal chunk size: {optimal_chunk:,}")
return optimal_chunk
except Exception as e:
print(f"Could not determine optimal chunk size: {e}")
return 2500 # fallback to original
def convert_to_parquet(filename: str, output_name: Optional[str] = None) -> str:
"""
Convert CSV to Parquet format with memory optimization.
Args:
filename: Input CSV filename
output_name: Output filename (without extension). If None, derived from input.
Returns:
Path to output parquet file
"""
input_path = Path(filename)
if output_name is None:
output_name = input_path.stem
output_path = PROCESSED_DIR / f"{output_name}.parquet"
with MemoryMonitor(f"CSV→Parquet conversion: {input_path.name}"):
try:
# Use chunked reading for large CSVs
file_size_mb = input_path.stat().st_size / (1024**2)
if file_size_mb > 500: # Use chunked reading for files > 500MB
print(f"Large file detected ({file_size_mb:.1f} MB), using chunked processing...")
chunk_list = []
chunksize = 50000
for i, chunk in enumerate(pd.read_csv(filename, chunksize=chunksize)):
chunk_list.append(chunk)
if (i + 1) % 10 == 0:
print(f" Processed {(i + 1) * chunksize:,} rows...")
df = pd.concat(chunk_list, ignore_index=True)
del chunk_list # Free memory
else:
df = pd.read_csv(filename)
# Write to parquet with compression
df.to_parquet(output_path, compression='snappy', index=False)
print(f"Successfully converted to: {output_path}")
print(f"Output size: {output_path.stat().st_size / (1024**2):.1f} MB")
return str(output_path)
except Exception as e:
print(f"Error in CSV conversion: {e}")
raise
def convert_h5_to_sparse_csr(
filename: str,
output_name: Optional[str] = None,
chunksize: Optional[int] = None,
sparsity_threshold: float = 0.1,
compression: bool = True
) -> Tuple[str, str]:
"""
Convert HDF5 file to sparse CSR format with optimized memory usage.
Args:
filename: Input HDF5 filename
output_name: Output filename prefix (without extension). If None, derived from input.
chunksize: Chunk size for processing. If None, auto-calculated.
sparsity_threshold: Minimum sparsity ratio to justify sparse conversion
compression: Whether to compress output files
Returns:
Tuple of (sparse_values_path, metadata_path)
"""
input_path = Path(filename)
if output_name is None:
output_name = input_path.stem
# Output paths in processed directory
sparse_path = PROCESSED_DIR / f"{output_name}_values.npz"
metadata_path = PROCESSED_DIR / f"{output_name}_metadata.npz"
with MemoryMonitor(f"HDF5→Sparse conversion: {input_path.name}"):
# Auto-calculate optimal chunk size if not provided
if chunksize is None:
chunksize = get_optimal_chunksize(filename)
# Initialize tracking variables
start_idx = 0
total_rows = 0
sparse_chunks: List[scipy.sparse.csr_matrix] = []
index_chunks: List[np.ndarray] = []
columns_name = None
sparsity_ratios = []
print(f"Starting processing with chunk size: {chunksize:,}")
# Process file in chunks
chunk_num = 0
while True:
try:
# Read chunk with explicit bounds checking
chunk_df = pd.read_hdf(
filename,
start=start_idx,
stop=start_idx + chunksize
)
if len(chunk_df) == 0:
print("Reached end of file")
break
chunk_num += 1
# Handle columns on first iteration
if columns_name is None:
columns_name = chunk_df.columns.to_numpy()
print(f"Dataset columns: {len(columns_name):,}")
else:
# Verify column consistency
if not np.array_equal(columns_name, chunk_df.columns.to_numpy()):
raise ValueError(f"Column mismatch in chunk {chunk_num}")
# Convert to numpy and check sparsity
chunk_array = chunk_df.to_numpy()
zero_ratio = (chunk_array == 0).mean()
sparsity_ratios.append(zero_ratio)
# Only convert to sparse if sufficiently sparse
if zero_ratio >= sparsity_threshold:
chunk_sparse = scipy.sparse.csr_matrix(chunk_array)
else:
# For dense data, still create sparse matrix but warn
chunk_sparse = scipy.sparse.csr_matrix(chunk_array)
if chunk_num == 1: # Only warn once
print(f"Warning: Data appears dense (sparsity: {zero_ratio:.3f}). "
f"Sparse format may not be optimal.")
sparse_chunks.append(chunk_sparse)
index_chunks.append(chunk_df.index.to_numpy())
total_rows += len(chunk_df)
# Progress reporting
print(f"Chunk {chunk_num}: {len(chunk_df):,} rows, "
f"sparsity: {zero_ratio:.3f}, "
f"total: {total_rows:,}")
# Clean up chunk data
del chunk_df, chunk_array
# Check if this was the last chunk
if len(sparse_chunks[-1].toarray()) < chunksize:
print("Reached final chunk")
break
start_idx += chunksize
# Periodic garbage collection for large datasets
if chunk_num % 10 == 0:
gc.collect()
current_memory = psutil.virtual_memory().used / (1024**3)
print(f" Memory check at chunk {chunk_num}: {current_memory:.2f} GB")
except Exception as e:
print(f"Error processing chunk {chunk_num} (start: {start_idx}): {e}")
raise
if not sparse_chunks:
raise ValueError("No data was processed!")
print(f"\nCombining {len(sparse_chunks)} chunks...")
# Combine all sparse chunks efficiently
try:
combined_sparse = scipy.sparse.vstack(sparse_chunks, format='csr')
del sparse_chunks # Free memory immediately
gc.collect()
combined_indices = np.concatenate(index_chunks)
del index_chunks
gc.collect()
except MemoryError:
print("Memory error during combination. Trying sequential approach...")
# Fallback: combine in smaller groups
result = sparse_chunks[0]
result_indices = index_chunks[0]
for i in range(1, len(sparse_chunks)):
result = scipy.sparse.vstack([result, sparse_chunks[i]], format='csr')
result_indices = np.concatenate([result_indices, index_chunks[i]])
if i % 5 == 0: # Clean up every 5 iterations
gc.collect()
combined_sparse = result
combined_indices = result_indices
del sparse_chunks, index_chunks
gc.collect()
# Calculate and report statistics
final_sparsity = (combined_sparse.data == 0).sum() / combined_sparse.nnz if combined_sparse.nnz > 0 else 0
avg_sparsity = np.mean(sparsity_ratios)
print(f"\nConversion complete!")
print(f" Final shape: {combined_sparse.shape}")
print(f" Average chunk sparsity: {avg_sparsity:.3f}")
print(f" Final sparsity: {final_sparsity:.3f}")
print(f" Non-zero elements: {combined_sparse.nnz:,}")
# Save sparse matrix with compression
print(f"Saving sparse matrix to: {sparse_path}")
if compression:
scipy.sparse.save_npz(sparse_path, combined_sparse, compressed=True)
else:
scipy.sparse.save_npz(sparse_path, combined_sparse, compressed=False)
# Save metadata with comprehensive information
print(f"Saving metadata to: {metadata_path}")
metadata = {
'index': combined_indices,
'columns': columns_name,
'shape': combined_sparse.shape,
'nnz': combined_sparse.nnz,
'sparsity': avg_sparsity,
'dtype': str(combined_sparse.dtype),
'format': combined_sparse.format,
'created_timestamp': datetime.datetime.now().isoformat(),
'source_file': str(input_path),
'chunk_size_used': chunksize
}
if compression:
np.savez_compressed(metadata_path, **metadata)
else:
np.savez(metadata_path, **metadata)
# Report file sizes
sparse_size_mb = sparse_path.stat().st_size / (1024**2)
metadata_size_mb = metadata_path.stat().st_size / (1024**2)
print(f"\nOutput files:")
print(f" Sparse data: {sparse_size_mb:.1f} MB")
print(f" Metadata: {metadata_size_mb:.1f} MB")
print(f" Total: {sparse_size_mb + metadata_size_mb:.1f} MB")
return str(sparse_path), str(metadata_path)
def load_sparse_data(sparse_path: str, metadata_path: str) -> Tuple[scipy.sparse.csr_matrix, np.ndarray, np.ndarray]:
"""
Load sparse matrix and associated metadata.
Args:
sparse_path: Path to sparse matrix file
metadata_path: Path to metadata file
Returns:
Tuple of (sparse_matrix, index_array, columns_array)
"""
with MemoryMonitor("Loading sparse data"):
sparse_matrix = scipy.sparse.load_npz(sparse_path)
metadata = np.load(metadata_path, allow_pickle=True)
index_array = metadata['index']
columns_array = metadata['columns']
print(f"Loaded sparse matrix: {sparse_matrix.shape}")
print(f"Data type: {sparse_matrix.dtype}")
print(f"Sparsity: {metadata.get('sparsity', 'unknown')}")
return sparse_matrix, index_array, columns_array
def batch_process_files(
input_files: List[str],
output_prefix: str,
chunksize: Optional[int] = None
) -> None:
"""
Process multiple HDF5 files in batch with optimized settings.
Args:
input_files: List of input HDF5 file paths
output_prefix: Prefix for output files
chunksize: Chunk size for processing
"""
print(f"Starting batch processing of {len(input_files)} files...")
for i, filepath in enumerate(input_files, 1):
print(f"\n{'='*60}")
print(f"Processing file {i}/{len(input_files)}: {Path(filepath).name}")
print(f"{'='*60}")
output_name = f"{output_prefix}_{Path(filepath).stem}"
try:
sparse_path, metadata_path = convert_h5_to_sparse_csr(
filepath,
output_name=output_name,
chunksize=chunksize
)
print(f"✓ Successfully processed: {filepath}")
except Exception as e:
print(f"✗ Error processing {filepath}: {e}")
continue
print(f"\nBatch processing complete!")
In [10]:
sparse_path, metadata_path = convert_h5_to_sparse_csr(
str(FP_MULTIOME_TRAIN_INPUT),
output_name="train_multi_inputs",
)
Starting HDF5→Sparse conversion: train_multi_inputs.h5 - Initial memory: 94.19 GB Could not determine optimal chunk size: 'FrameFixed' object has no attribute 'ncols' Starting processing with chunk size: 2,500 Dataset columns: 228,942 Chunk 1: 2,500 rows, sparsity: 0.978, total: 2,500 Chunk 2: 2,500 rows, sparsity: 0.979, total: 5,000 Chunk 3: 2,500 rows, sparsity: 0.978, total: 7,500 Chunk 4: 2,500 rows, sparsity: 0.970, total: 10,000 Chunk 5: 2,500 rows, sparsity: 0.970, total: 12,500 Chunk 6: 2,500 rows, sparsity: 0.970, total: 15,000 Chunk 7: 2,500 rows, sparsity: 0.975, total: 17,500 Chunk 8: 2,500 rows, sparsity: 0.977, total: 20,000 Chunk 9: 2,500 rows, sparsity: 0.977, total: 22,500 Chunk 10: 2,500 rows, sparsity: 0.977, total: 25,000 Memory check at chunk 10: 95.53 GB Chunk 11: 2,500 rows, sparsity: 0.978, total: 27,500 Chunk 12: 2,500 rows, sparsity: 0.979, total: 30,000 Chunk 13: 2,500 rows, sparsity: 0.979, total: 32,500 Chunk 14: 2,500 rows, sparsity: 0.978, total: 35,000 Chunk 15: 2,500 rows, sparsity: 0.978, total: 37,500 Chunk 16: 2,500 rows, sparsity: 0.978, total: 40,000 Chunk 17: 2,500 rows, sparsity: 0.978, total: 42,500 Chunk 18: 2,500 rows, sparsity: 0.970, total: 45,000 Chunk 19: 2,500 rows, sparsity: 0.967, total: 47,500 Chunk 20: 2,500 rows, sparsity: 0.967, total: 50,000 Memory check at chunk 20: 96.79 GB Chunk 21: 2,500 rows, sparsity: 0.968, total: 52,500 Chunk 22: 2,500 rows, sparsity: 0.977, total: 55,000 Chunk 23: 2,500 rows, sparsity: 0.977, total: 57,500 Chunk 24: 2,500 rows, sparsity: 0.977, total: 60,000 Chunk 25: 2,500 rows, sparsity: 0.977, total: 62,500 Chunk 26: 2,500 rows, sparsity: 0.978, total: 65,000 Chunk 27: 2,500 rows, sparsity: 0.978, total: 67,500 Chunk 28: 2,500 rows, sparsity: 0.978, total: 70,000 Chunk 29: 2,500 rows, sparsity: 0.979, total: 72,500 Chunk 30: 2,500 rows, sparsity: 0.979, total: 75,000 Memory check at chunk 30: 97.96 GB Chunk 31: 2,500 rows, sparsity: 0.972, total: 77,500 Chunk 32: 2,500 rows, sparsity: 0.966, total: 80,000 Chunk 33: 2,500 rows, sparsity: 0.965, total: 82,500 Chunk 34: 2,500 rows, sparsity: 0.965, total: 85,000 Chunk 35: 2,500 rows, sparsity: 0.966, total: 87,500 Chunk 36: 2,500 rows, sparsity: 0.976, total: 90,000 Chunk 37: 2,500 rows, sparsity: 0.976, total: 92,500 Chunk 38: 2,500 rows, sparsity: 0.976, total: 95,000 Chunk 39: 2,500 rows, sparsity: 0.976, total: 97,500 Chunk 40: 2,500 rows, sparsity: 0.978, total: 100,000 Memory check at chunk 40: 99.37 GB Chunk 41: 2,500 rows, sparsity: 0.979, total: 102,500 Chunk 42: 2,500 rows, sparsity: 0.979, total: 105,000 Chunk 43: 942 rows, sparsity: 0.979, total: 105,942 Reached final chunk Combining 43 chunks... Conversion complete! Final shape: (105942, 228942) Average chunk sparsity: 0.975 Final sparsity: 0.000 Non-zero elements: 607,301,546 Saving sparse matrix to: /share/crsp/lab/pkaiser/ddlin/single-cell-multimodal-ml/data/processed/train_multi_inputs_values.npz Saving metadata to: /share/crsp/lab/pkaiser/ddlin/single-cell-multimodal-ml/data/processed/train_multi_inputs_metadata.npz Output files: Sparse data: 2834.8 MB Metadata: 2.7 MB Total: 2837.4 MB Completed HDF5→Sparse conversion: train_multi_inputs.h5 - Final memory: 100.31 GB (Δ: +6.12 GB)
In [11]:
sparse_path, metadata_path = convert_h5_to_sparse_csr(
str(FP_MULTIOME_TRAIN_TARGETS),
output_name="train_multi_targets",
)
Starting HDF5→Sparse conversion: train_multi_targets.h5 - Initial memory: 95.73 GB Could not determine optimal chunk size: 'FrameFixed' object has no attribute 'ncols' Starting processing with chunk size: 2,500 Dataset columns: 23,418 Chunk 1: 2,500 rows, sparsity: 0.824, total: 2,500 Chunk 2: 2,500 rows, sparsity: 0.826, total: 5,000 Chunk 3: 2,500 rows, sparsity: 0.828, total: 7,500 Chunk 4: 2,500 rows, sparsity: 0.831, total: 10,000 Chunk 5: 2,500 rows, sparsity: 0.832, total: 12,500 Chunk 6: 2,500 rows, sparsity: 0.833, total: 15,000 Chunk 7: 2,500 rows, sparsity: 0.829, total: 17,500 Chunk 8: 2,500 rows, sparsity: 0.827, total: 20,000 Chunk 9: 2,500 rows, sparsity: 0.825, total: 22,500 Chunk 10: 2,500 rows, sparsity: 0.826, total: 25,000 Memory check at chunk 10: 96.47 GB Chunk 11: 2,500 rows, sparsity: 0.838, total: 27,500 Chunk 12: 2,500 rows, sparsity: 0.871, total: 30,000 Chunk 13: 2,500 rows, sparsity: 0.869, total: 32,500 Chunk 14: 2,500 rows, sparsity: 0.850, total: 35,000 Chunk 15: 2,500 rows, sparsity: 0.834, total: 37,500 Chunk 16: 2,500 rows, sparsity: 0.832, total: 40,000 Chunk 17: 2,500 rows, sparsity: 0.832, total: 42,500 Chunk 18: 2,500 rows, sparsity: 0.831, total: 45,000 Chunk 19: 2,500 rows, sparsity: 0.827, total: 47,500 Chunk 20: 2,500 rows, sparsity: 0.827, total: 50,000 Memory check at chunk 20: 97.19 GB Chunk 21: 2,500 rows, sparsity: 0.827, total: 52,500 Chunk 22: 2,500 rows, sparsity: 0.825, total: 55,000 Chunk 23: 2,500 rows, sparsity: 0.825, total: 57,500 Chunk 24: 2,500 rows, sparsity: 0.825, total: 60,000 Chunk 25: 2,500 rows, sparsity: 0.835, total: 62,500 Chunk 26: 2,500 rows, sparsity: 0.855, total: 65,000 Chunk 27: 2,500 rows, sparsity: 0.855, total: 67,500 Chunk 28: 2,500 rows, sparsity: 0.845, total: 70,000 Chunk 29: 2,500 rows, sparsity: 0.829, total: 72,500 Chunk 30: 2,500 rows, sparsity: 0.829, total: 75,000 Memory check at chunk 30: 97.81 GB Chunk 31: 2,500 rows, sparsity: 0.831, total: 77,500 Chunk 32: 2,500 rows, sparsity: 0.835, total: 80,000 Chunk 33: 2,500 rows, sparsity: 0.835, total: 82,500 Chunk 34: 2,500 rows, sparsity: 0.836, total: 85,000 Chunk 35: 2,500 rows, sparsity: 0.836, total: 87,500 Chunk 36: 2,500 rows, sparsity: 0.821, total: 90,000 Chunk 37: 2,500 rows, sparsity: 0.822, total: 92,500 Chunk 38: 2,500 rows, sparsity: 0.823, total: 95,000 Chunk 39: 2,500 rows, sparsity: 0.823, total: 97,500 Chunk 40: 2,500 rows, sparsity: 0.855, total: 100,000 Memory check at chunk 40: 98.57 GB Chunk 41: 2,500 rows, sparsity: 0.869, total: 102,500 Chunk 42: 2,500 rows, sparsity: 0.868, total: 105,000 Chunk 43: 942 rows, sparsity: 0.869, total: 105,942 Reached final chunk Combining 43 chunks... Conversion complete! Final shape: (105942, 23418) Average chunk sparsity: 0.836 Final sparsity: 0.000 Non-zero elements: 407,024,875 Saving sparse matrix to: /share/crsp/lab/pkaiser/ddlin/single-cell-multimodal-ml/data/processed/train_multi_targets_values.npz Saving metadata to: /share/crsp/lab/pkaiser/ddlin/single-cell-multimodal-ml/data/processed/train_multi_targets_metadata.npz Output files: Sparse data: 824.0 MB Metadata: 0.9 MB Total: 824.8 MB Completed HDF5→Sparse conversion: train_multi_targets.h5 - Final memory: 99.19 GB (Δ: +3.46 GB)
In [12]:
sparse_path, metadata_path = convert_h5_to_sparse_csr(
str(FP_MULTIOME_TEST_INPUTS),
output_name="test_multi_inputs",
)
Starting HDF5→Sparse conversion: test_multi_inputs.h5 - Initial memory: 96.15 GB Could not determine optimal chunk size: 'FrameFixed' object has no attribute 'ncols' Starting processing with chunk size: 2,500 Dataset columns: 228,942 Chunk 1: 2,500 rows, sparsity: 0.978, total: 2,500 Chunk 2: 2,500 rows, sparsity: 0.978, total: 5,000 Chunk 3: 2,500 rows, sparsity: 0.978, total: 7,500 Chunk 4: 2,500 rows, sparsity: 0.974, total: 10,000 Chunk 5: 2,500 rows, sparsity: 0.968, total: 12,500 Chunk 6: 2,500 rows, sparsity: 0.967, total: 15,000 Chunk 7: 2,500 rows, sparsity: 0.970, total: 17,500 Chunk 8: 2,500 rows, sparsity: 0.978, total: 20,000 Chunk 9: 2,500 rows, sparsity: 0.978, total: 22,500 Chunk 10: 2,500 rows, sparsity: 0.976, total: 25,000 Memory check at chunk 10: 97.48 GB Chunk 11: 2,500 rows, sparsity: 0.971, total: 27,500 Chunk 12: 2,500 rows, sparsity: 0.972, total: 30,000 Chunk 13: 2,500 rows, sparsity: 0.972, total: 32,500 Chunk 14: 2,500 rows, sparsity: 0.971, total: 35,000 Chunk 15: 2,500 rows, sparsity: 0.971, total: 37,500 Chunk 16: 2,500 rows, sparsity: 0.971, total: 40,000 Chunk 17: 2,500 rows, sparsity: 0.969, total: 42,500 Chunk 18: 2,500 rows, sparsity: 0.970, total: 45,000 Chunk 19: 2,500 rows, sparsity: 0.970, total: 47,500 Chunk 20: 2,500 rows, sparsity: 0.970, total: 50,000 Memory check at chunk 20: 99.02 GB Chunk 21: 2,500 rows, sparsity: 0.971, total: 52,500 Chunk 22: 2,500 rows, sparsity: 0.970, total: 55,000 Chunk 23: 935 rows, sparsity: 0.970, total: 55,935 Reached final chunk Combining 23 chunks... Conversion complete! Final shape: (55935, 228942) Average chunk sparsity: 0.972 Final sparsity: 0.000 Non-zero elements: 353,572,710 Saving sparse matrix to: /share/crsp/lab/pkaiser/ddlin/single-cell-multimodal-ml/data/processed/test_multi_inputs_values.npz Saving metadata to: /share/crsp/lab/pkaiser/ddlin/single-cell-multimodal-ml/data/processed/test_multi_inputs_metadata.npz Output files: Sparse data: 1642.1 MB Metadata: 2.3 MB Total: 1644.4 MB Completed HDF5→Sparse conversion: test_multi_inputs.h5 - Final memory: 76.91 GB (Δ: -19.24 GB)
Dimension reduction with Truncated SVD¶
- We will use the sparce matrices that we just created
In [9]:
sparse_matrix = scipy.sparse.load_npz(PROCESSED_DIR.joinpath("train_multi_inputs_values.npz"))
# Load metadata
metadata = np.load(
PROCESSED_DIR.joinpath("train_multi_inputs_metadata.npz"),
allow_pickle=True
)
index = metadata['index']
columns = metadata['columns']
In [13]:
# Rows are cells and cols are genomic regions
sparse_matrix
Out[13]:
<Compressed Sparse Row sparse matrix of dtype 'float32' with 607301546 stored elements and shape (105942, 228942)>
In [14]:
# OK these are the cells
index
Out[14]:
array(['56390cf1b95e', 'fc0c60183c33', '9b4a87e22ad0', ..., '00783f28b463', 'e7abb1a0f251', '193992d571a5'], dtype=object)
In [ ]:
# These are genomic regions
columns
Out[ ]:
array(['GL000194.1:114519-115365', 'GL000194.1:55758-56597', 'GL000194.1:58217-58957', ..., 'chrY:7836768-7837671', 'chrY:7869454-7870371', 'chrY:7873814-7874709'], dtype=object)
In [18]:
RUN_SVD = True # Set to False if you want to load precomputed
# Dimension of SVD
n_components = 512
# Paths for saving/loading the reduced arrays and SVD objects (using "embeddings" and "model" for clarity)
TRAIN_INPUTS_EMBEDDINGS_PKL = PROCESSED_DIR / "train_multi_inputs_embeddings_512.pkl"
TEST_INPUTS_EMBEDDINGS_PKL = PROCESSED_DIR / "test_multi_inputs_embeddings_512.pkl"
TRAIN_TARGETS_EMBEDDINGS_PKL = PROCESSED_DIR / "train_multi_targets_embeddings_512.pkl"
SVD_INPUTS_MODEL_PKL = PROCESSED_DIR / "svd_inputs_model_512.pkl"
SVD_TARGETS_MODEL_PKL = PROCESSED_DIR / "svd_targets_model_512.pkl"
if RUN_SVD:
print("Computing SVD embeddings...")
# Process inputs first (load one at a time to save memory)
# Load train inputs sparse matrix
train_multi_inputs = scipy.sparse.load_npz(PROCESSED_DIR / "train_multi_inputs_values.npz")
# SVD for inputs: fit on train
svd_inputs = TruncatedSVD(n_components=n_components, random_state=42)
X = svd_inputs.fit_transform(train_multi_inputs)
# Save train embeddings and model so far
with open(TRAIN_INPUTS_EMBEDDINGS_PKL, "wb") as f:
pickle.dump(X, f)
with open(SVD_INPUTS_MODEL_PKL, "wb") as f:
pickle.dump(svd_inputs, f)
# Cleanup train inputs
del train_multi_inputs, X
gc.collect()
# Load test inputs sparse matrix
test_multi_inputs = scipy.sparse.load_npz(PROCESSED_DIR / "test_multi_inputs_values.npz")
# Transform test
Xt = svd_inputs.transform(test_multi_inputs)
# Save test embeddings
with open(TEST_INPUTS_EMBEDDINGS_PKL, "wb") as f:
pickle.dump(Xt, f)
# Cleanup test inputs
del test_multi_inputs, Xt
gc.collect()
# Now process targets (only train targets available)
# Load train targets sparse matrix
train_multi_targets = scipy.sparse.load_npz(PROCESSED_DIR / "train_multi_targets_values.npz")
# SVD for targets: fit on train
svd_targets = TruncatedSVD(n_components=n_components, random_state=42)
Y = svd_targets.fit_transform(train_multi_targets)
# Save targets embeddings and model
with open(TRAIN_TARGETS_EMBEDDINGS_PKL, "wb") as f:
pickle.dump(Y, f)
with open(SVD_TARGETS_MODEL_PKL, "wb") as f:
pickle.dump(svd_targets, f)
# Cleanup
del train_multi_targets, Y
gc.collect()
print("Computed and saved SVD embeddings (512 PCs).")
else:
# Load precomputed reduced arrays
with open(TRAIN_INPUTS_EMBEDDINGS_PKL, "rb") as f:
X = pickle.load(f)
with open(TEST_INPUTS_EMBEDDINGS_PKL, "rb") as f:
Xt = pickle.load(f)
with open(TRAIN_TARGETS_EMBEDDINGS_PKL, "rb") as f:
Y = pickle.load(f)
# Load precomputed SVD objects
with open(SVD_INPUTS_MODEL_PKL, "rb") as f:
svd_inputs = pickle.load(f)
with open(SVD_TARGETS_MODEL_PKL, "rb") as f:
svd_targets = pickle.load(f)
print(f"Loaded precomputed SVD embeddings X (512 PCs): {X.shape}, Xt: {Xt.shape}, Y: {Y.shape}")
Computing SVD embeddings... Computed and saved SVD embeddings (512 PCs).
Greate Parquet files¶
- Parquet is a more efficient format than csv
- After conversion, simply update your data loading calls from pd.read_csv('filename.csv') to pd.read_parquet('filename.parquet') when reading into DataFrames.
In [11]:
METADATA = RAW_DIR.joinpath("metadata.csv")
EVAL = RAW_DIR.joinpath("evaluation_ids.csv")
SUBMIT = RAW_DIR.joinpath("sample_submission.csv")
def convert_to_parquet(path_to_csv, out_filename):
df = pd.read_csv(path_to_csv)
df.to_parquet(PROCESSED_DIR.joinpath(f"{out_filename}.parquet"))
In [12]:
convert_to_parquet(METADATA, "metadata")
In [13]:
convert_to_parquet(EVAL, "evaluation")
In [14]:
convert_to_parquet(SUBMIT, "sample_submission")
In [ ]: