In [1]:
# Standard library
import gc
import warnings
from typing import Union, Optional, Tuple, List, Dict, Any
from pathlib import Path
import pickle


# Core data science
import numpy as np
import pandas as pd
import scipy

from sklearn.decomposition import TruncatedSVD, PCA

warnings.filterwarnings("ignore")

DATA_DIR = Path("/share/crsp/lab/pkaiser/ddlin/single-cell-multimodal-ml/data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DIR = DATA_DIR / "processed"
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

FP_CITE_TRAIN_INPUTS  = RAW_DIR.joinpath("train_cite_inputs.h5")
FP_CITE_TRAIN_TARGETS = RAW_DIR.joinpath("train_cite_targets.h5")
FP_CITE_TEST_INPUTS   = RAW_DIR.joinpath("test_cite_inputs.h5")

FP_MULTIOME_TRAIN_INPUT = RAW_DIR.joinpath("train_multi_inputs.h5")
FP_MULTIOME_TRAIN_TARGETS = RAW_DIR.joinpath("train_multi_targets.h5")
FP_MULTIOME_TEST_INPUTS   = RAW_DIR.joinpath("test_multi_inputs.h5")

FP_SUBMISSION         = RAW_DIR.joinpath("sample_submission.csv")
FP_EVALUATION_IDS     = RAW_DIR.joinpath("evaluation_ids.csv")
FP_CELL_METADATA      = RAW_DIR.joinpath("metadata.csv")

VERBOSE = 0

------ Sparse Data Generation and Dimension Reduction---------¶

  • The multiome data from scATAC seq is sparce (>95% empty) and noisy
  • We will handle it by converting the .h5 output to a sparce matrix, followed by dimension reduction
  • This data preprocessing are critical for the downstream ML works

------ Scipy Sparce Matrices ---------¶

In [8]:
class MemoryMonitor:
    """Context manager for monitoring memory usage during processing"""
    
    def __init__(self, operation_name: str):
        self.operation_name = operation_name
        self.initial_memory = None
        
    def __enter__(self):
        gc.collect()  # Clean up before measurement
        self.initial_memory = psutil.virtual_memory().used / (1024**3)  # GB
        print(f"Starting {self.operation_name} - Initial memory: {self.initial_memory:.2f} GB")
        return self
        
    def __exit__(self, exc_type, exc_val, exc_tb):
        gc.collect()
        final_memory = psutil.virtual_memory().used / (1024**3)  # GB
        print(f"Completed {self.operation_name} - Final memory: {final_memory:.2f} GB "
              f"(Δ: {final_memory - self.initial_memory:+.2f} GB)")


def get_optimal_chunksize(filename: str, target_memory_gb: float = 2.0) -> int:
    """
    Calculate optimal chunk size based on file size and available memory.
    
    Args:
        filename: Path to HDF5 file
        target_memory_gb: Target memory usage per chunk in GB
        
    Returns:
        Optimal chunk size
    """
    try:
        with pd.HDFStore(filename, mode='r') as store:
            # Get basic info about the dataset
            info = store.get_storer(list(store.keys())[0])
            total_rows = info.nrows
            total_cols = info.ncols
            
        # Estimate memory per row (assuming float64, ~8 bytes per element)
        bytes_per_row = total_cols * 8
        target_bytes = target_memory_gb * (1024**3)
        optimal_chunk = int(target_bytes / bytes_per_row)
        
        # Ensure reasonable bounds
        optimal_chunk = max(1000, min(optimal_chunk, 10000))
        
        print(f"Dataset info: {total_rows:,} rows × {total_cols:,} cols")
        print(f"Calculated optimal chunk size: {optimal_chunk:,}")
        
        return optimal_chunk
        
    except Exception as e:
        print(f"Could not determine optimal chunk size: {e}")
        return 2500  # fallback to original


def convert_to_parquet(filename: str, output_name: Optional[str] = None) -> str:
    """
    Convert CSV to Parquet format with memory optimization.
    
    Args:
        filename: Input CSV filename
        output_name: Output filename (without extension). If None, derived from input.
        
    Returns:
        Path to output parquet file
    """
    input_path = Path(filename)
    
    if output_name is None:
        output_name = input_path.stem
    
    output_path = PROCESSED_DIR / f"{output_name}.parquet"
    
    with MemoryMonitor(f"CSV→Parquet conversion: {input_path.name}"):
        try:
            # Use chunked reading for large CSVs
            file_size_mb = input_path.stat().st_size / (1024**2)
            
            if file_size_mb > 500:  # Use chunked reading for files > 500MB
                print(f"Large file detected ({file_size_mb:.1f} MB), using chunked processing...")
                
                chunk_list = []
                chunksize = 50000
                
                for i, chunk in enumerate(pd.read_csv(filename, chunksize=chunksize)):
                    chunk_list.append(chunk)
                    if (i + 1) % 10 == 0:
                        print(f"  Processed {(i + 1) * chunksize:,} rows...")
                
                df = pd.concat(chunk_list, ignore_index=True)
                del chunk_list  # Free memory
                
            else:
                df = pd.read_csv(filename)
            
            # Write to parquet with compression
            df.to_parquet(output_path, compression='snappy', index=False)
            
            print(f"Successfully converted to: {output_path}")
            print(f"Output size: {output_path.stat().st_size / (1024**2):.1f} MB")
            
            return str(output_path)
            
        except Exception as e:
            print(f"Error in CSV conversion: {e}")
            raise


def convert_h5_to_sparse_csr(
    filename: str, 
    output_name: Optional[str] = None,
    chunksize: Optional[int] = None,
    sparsity_threshold: float = 0.1,
    compression: bool = True
) -> Tuple[str, str]:
    """
    Convert HDF5 file to sparse CSR format with optimized memory usage.
    
    Args:
        filename: Input HDF5 filename  
        output_name: Output filename prefix (without extension). If None, derived from input.
        chunksize: Chunk size for processing. If None, auto-calculated.
        sparsity_threshold: Minimum sparsity ratio to justify sparse conversion
        compression: Whether to compress output files
        
    Returns:
        Tuple of (sparse_values_path, metadata_path)
    """
    input_path = Path(filename)
    
    if output_name is None:
        output_name = input_path.stem
    
    # Output paths in processed directory
    sparse_path = PROCESSED_DIR / f"{output_name}_values.npz"
    metadata_path = PROCESSED_DIR / f"{output_name}_metadata.npz"
    
    with MemoryMonitor(f"HDF5→Sparse conversion: {input_path.name}"):
        
        # Auto-calculate optimal chunk size if not provided
        if chunksize is None:
            chunksize = get_optimal_chunksize(filename)
        
        # Initialize tracking variables
        start_idx = 0
        total_rows = 0
        sparse_chunks: List[scipy.sparse.csr_matrix] = []
        index_chunks: List[np.ndarray] = []
        columns_name = None
        sparsity_ratios = []
        
        print(f"Starting processing with chunk size: {chunksize:,}")
        
        # Process file in chunks
        chunk_num = 0
        while True:
            try:
                # Read chunk with explicit bounds checking
                chunk_df = pd.read_hdf(
                    filename, 
                    start=start_idx, 
                    stop=start_idx + chunksize
                )
                
                if len(chunk_df) == 0:
                    print("Reached end of file")
                    break
                
                chunk_num += 1
                
                # Handle columns on first iteration
                if columns_name is None:
                    columns_name = chunk_df.columns.to_numpy()
                    print(f"Dataset columns: {len(columns_name):,}")
                else:
                    # Verify column consistency
                    if not np.array_equal(columns_name, chunk_df.columns.to_numpy()):
                        raise ValueError(f"Column mismatch in chunk {chunk_num}")
                
                # Convert to numpy and check sparsity
                chunk_array = chunk_df.to_numpy()
                zero_ratio = (chunk_array == 0).mean()
                sparsity_ratios.append(zero_ratio)
                
                # Only convert to sparse if sufficiently sparse
                if zero_ratio >= sparsity_threshold:
                    chunk_sparse = scipy.sparse.csr_matrix(chunk_array)
                else:
                    # For dense data, still create sparse matrix but warn
                    chunk_sparse = scipy.sparse.csr_matrix(chunk_array)
                    if chunk_num == 1:  # Only warn once
                        print(f"Warning: Data appears dense (sparsity: {zero_ratio:.3f}). "
                              f"Sparse format may not be optimal.")
                
                sparse_chunks.append(chunk_sparse)
                index_chunks.append(chunk_df.index.to_numpy())
                
                total_rows += len(chunk_df)
                
                # Progress reporting
                print(f"Chunk {chunk_num}: {len(chunk_df):,} rows, "
                      f"sparsity: {zero_ratio:.3f}, "
                      f"total: {total_rows:,}")
                
                # Clean up chunk data
                del chunk_df, chunk_array
                
                # Check if this was the last chunk
                if len(sparse_chunks[-1].toarray()) < chunksize:
                    print("Reached final chunk")
                    break
                    
                start_idx += chunksize
                
                # Periodic garbage collection for large datasets
                if chunk_num % 10 == 0:
                    gc.collect()
                    current_memory = psutil.virtual_memory().used / (1024**3)
                    print(f"  Memory check at chunk {chunk_num}: {current_memory:.2f} GB")
                
            except Exception as e:
                print(f"Error processing chunk {chunk_num} (start: {start_idx}): {e}")
                raise
        
        if not sparse_chunks:
            raise ValueError("No data was processed!")
        
        print(f"\nCombining {len(sparse_chunks)} chunks...")
        
        # Combine all sparse chunks efficiently
        try:
            combined_sparse = scipy.sparse.vstack(sparse_chunks, format='csr')
            del sparse_chunks  # Free memory immediately
            gc.collect()
            
            combined_indices = np.concatenate(index_chunks)
            del index_chunks
            gc.collect()
            
        except MemoryError:
            print("Memory error during combination. Trying sequential approach...")
            # Fallback: combine in smaller groups
            result = sparse_chunks[0]
            result_indices = index_chunks[0]
            
            for i in range(1, len(sparse_chunks)):
                result = scipy.sparse.vstack([result, sparse_chunks[i]], format='csr')
                result_indices = np.concatenate([result_indices, index_chunks[i]])
                
                if i % 5 == 0:  # Clean up every 5 iterations
                    gc.collect()
                    
            combined_sparse = result
            combined_indices = result_indices
            del sparse_chunks, index_chunks
            gc.collect()
        
        # Calculate and report statistics
        final_sparsity = (combined_sparse.data == 0).sum() / combined_sparse.nnz if combined_sparse.nnz > 0 else 0
        avg_sparsity = np.mean(sparsity_ratios)
        
        print(f"\nConversion complete!")
        print(f"  Final shape: {combined_sparse.shape}")
        print(f"  Average chunk sparsity: {avg_sparsity:.3f}")
        print(f"  Final sparsity: {final_sparsity:.3f}")
        print(f"  Non-zero elements: {combined_sparse.nnz:,}")
        
        # Save sparse matrix with compression
        print(f"Saving sparse matrix to: {sparse_path}")
        if compression:
            scipy.sparse.save_npz(sparse_path, combined_sparse, compressed=True)
        else:
            scipy.sparse.save_npz(sparse_path, combined_sparse, compressed=False)
        
        # Save metadata with comprehensive information
        print(f"Saving metadata to: {metadata_path}")
        metadata = {
            'index': combined_indices,
            'columns': columns_name,
            'shape': combined_sparse.shape,
            'nnz': combined_sparse.nnz,
            'sparsity': avg_sparsity,
            'dtype': str(combined_sparse.dtype),
            'format': combined_sparse.format,
            'created_timestamp': datetime.datetime.now().isoformat(),
            'source_file': str(input_path),
            'chunk_size_used': chunksize
        }
        
        if compression:
            np.savez_compressed(metadata_path, **metadata)
        else:
            np.savez(metadata_path, **metadata)
        
        # Report file sizes
        sparse_size_mb = sparse_path.stat().st_size / (1024**2)
        metadata_size_mb = metadata_path.stat().st_size / (1024**2)
        
        print(f"\nOutput files:")
        print(f"  Sparse data: {sparse_size_mb:.1f} MB")
        print(f"  Metadata: {metadata_size_mb:.1f} MB")
        print(f"  Total: {sparse_size_mb + metadata_size_mb:.1f} MB")
        
        return str(sparse_path), str(metadata_path)


def load_sparse_data(sparse_path: str, metadata_path: str) -> Tuple[scipy.sparse.csr_matrix, np.ndarray, np.ndarray]:
    """
    Load sparse matrix and associated metadata.
    
    Args:
        sparse_path: Path to sparse matrix file
        metadata_path: Path to metadata file
        
    Returns:
        Tuple of (sparse_matrix, index_array, columns_array)
    """
    with MemoryMonitor("Loading sparse data"):
        sparse_matrix = scipy.sparse.load_npz(sparse_path)
        metadata = np.load(metadata_path, allow_pickle=True)
        
        index_array = metadata['index']
        columns_array = metadata['columns']
        
        print(f"Loaded sparse matrix: {sparse_matrix.shape}")
        print(f"Data type: {sparse_matrix.dtype}")
        print(f"Sparsity: {metadata.get('sparsity', 'unknown')}")
        
        return sparse_matrix, index_array, columns_array


def batch_process_files(
    input_files: List[str], 
    output_prefix: str,
    chunksize: Optional[int] = None
) -> None:
    """
    Process multiple HDF5 files in batch with optimized settings.
    
    Args:
        input_files: List of input HDF5 file paths
        output_prefix: Prefix for output files
        chunksize: Chunk size for processing
    """
    print(f"Starting batch processing of {len(input_files)} files...")
    
    for i, filepath in enumerate(input_files, 1):
        print(f"\n{'='*60}")
        print(f"Processing file {i}/{len(input_files)}: {Path(filepath).name}")
        print(f"{'='*60}")
        
        output_name = f"{output_prefix}_{Path(filepath).stem}"
        
        try:
            sparse_path, metadata_path = convert_h5_to_sparse_csr(
                filepath, 
                output_name=output_name,
                chunksize=chunksize
            )
            
            print(f"✓ Successfully processed: {filepath}")
            
        except Exception as e:
            print(f"✗ Error processing {filepath}: {e}")
            continue
    
    print(f"\nBatch processing complete!")
In [10]:
sparse_path, metadata_path = convert_h5_to_sparse_csr(
    str(FP_MULTIOME_TRAIN_INPUT),
    output_name="train_multi_inputs",
)
Starting HDF5→Sparse conversion: train_multi_inputs.h5 - Initial memory: 94.19 GB
Could not determine optimal chunk size: 'FrameFixed' object has no attribute 'ncols'
Starting processing with chunk size: 2,500
Dataset columns: 228,942
Chunk 1: 2,500 rows, sparsity: 0.978, total: 2,500
Chunk 2: 2,500 rows, sparsity: 0.979, total: 5,000
Chunk 3: 2,500 rows, sparsity: 0.978, total: 7,500
Chunk 4: 2,500 rows, sparsity: 0.970, total: 10,000
Chunk 5: 2,500 rows, sparsity: 0.970, total: 12,500
Chunk 6: 2,500 rows, sparsity: 0.970, total: 15,000
Chunk 7: 2,500 rows, sparsity: 0.975, total: 17,500
Chunk 8: 2,500 rows, sparsity: 0.977, total: 20,000
Chunk 9: 2,500 rows, sparsity: 0.977, total: 22,500
Chunk 10: 2,500 rows, sparsity: 0.977, total: 25,000
  Memory check at chunk 10: 95.53 GB
Chunk 11: 2,500 rows, sparsity: 0.978, total: 27,500
Chunk 12: 2,500 rows, sparsity: 0.979, total: 30,000
Chunk 13: 2,500 rows, sparsity: 0.979, total: 32,500
Chunk 14: 2,500 rows, sparsity: 0.978, total: 35,000
Chunk 15: 2,500 rows, sparsity: 0.978, total: 37,500
Chunk 16: 2,500 rows, sparsity: 0.978, total: 40,000
Chunk 17: 2,500 rows, sparsity: 0.978, total: 42,500
Chunk 18: 2,500 rows, sparsity: 0.970, total: 45,000
Chunk 19: 2,500 rows, sparsity: 0.967, total: 47,500
Chunk 20: 2,500 rows, sparsity: 0.967, total: 50,000
  Memory check at chunk 20: 96.79 GB
Chunk 21: 2,500 rows, sparsity: 0.968, total: 52,500
Chunk 22: 2,500 rows, sparsity: 0.977, total: 55,000
Chunk 23: 2,500 rows, sparsity: 0.977, total: 57,500
Chunk 24: 2,500 rows, sparsity: 0.977, total: 60,000
Chunk 25: 2,500 rows, sparsity: 0.977, total: 62,500
Chunk 26: 2,500 rows, sparsity: 0.978, total: 65,000
Chunk 27: 2,500 rows, sparsity: 0.978, total: 67,500
Chunk 28: 2,500 rows, sparsity: 0.978, total: 70,000
Chunk 29: 2,500 rows, sparsity: 0.979, total: 72,500
Chunk 30: 2,500 rows, sparsity: 0.979, total: 75,000
  Memory check at chunk 30: 97.96 GB
Chunk 31: 2,500 rows, sparsity: 0.972, total: 77,500
Chunk 32: 2,500 rows, sparsity: 0.966, total: 80,000
Chunk 33: 2,500 rows, sparsity: 0.965, total: 82,500
Chunk 34: 2,500 rows, sparsity: 0.965, total: 85,000
Chunk 35: 2,500 rows, sparsity: 0.966, total: 87,500
Chunk 36: 2,500 rows, sparsity: 0.976, total: 90,000
Chunk 37: 2,500 rows, sparsity: 0.976, total: 92,500
Chunk 38: 2,500 rows, sparsity: 0.976, total: 95,000
Chunk 39: 2,500 rows, sparsity: 0.976, total: 97,500
Chunk 40: 2,500 rows, sparsity: 0.978, total: 100,000
  Memory check at chunk 40: 99.37 GB
Chunk 41: 2,500 rows, sparsity: 0.979, total: 102,500
Chunk 42: 2,500 rows, sparsity: 0.979, total: 105,000
Chunk 43: 942 rows, sparsity: 0.979, total: 105,942
Reached final chunk

Combining 43 chunks...

Conversion complete!
  Final shape: (105942, 228942)
  Average chunk sparsity: 0.975
  Final sparsity: 0.000
  Non-zero elements: 607,301,546
Saving sparse matrix to: /share/crsp/lab/pkaiser/ddlin/single-cell-multimodal-ml/data/processed/train_multi_inputs_values.npz
Saving metadata to: /share/crsp/lab/pkaiser/ddlin/single-cell-multimodal-ml/data/processed/train_multi_inputs_metadata.npz

Output files:
  Sparse data: 2834.8 MB
  Metadata: 2.7 MB
  Total: 2837.4 MB
Completed HDF5→Sparse conversion: train_multi_inputs.h5 - Final memory: 100.31 GB (Δ: +6.12 GB)
In [11]:
sparse_path, metadata_path = convert_h5_to_sparse_csr(
    str(FP_MULTIOME_TRAIN_TARGETS),
    output_name="train_multi_targets",
)
Starting HDF5→Sparse conversion: train_multi_targets.h5 - Initial memory: 95.73 GB
Could not determine optimal chunk size: 'FrameFixed' object has no attribute 'ncols'
Starting processing with chunk size: 2,500
Dataset columns: 23,418
Chunk 1: 2,500 rows, sparsity: 0.824, total: 2,500
Chunk 2: 2,500 rows, sparsity: 0.826, total: 5,000
Chunk 3: 2,500 rows, sparsity: 0.828, total: 7,500
Chunk 4: 2,500 rows, sparsity: 0.831, total: 10,000
Chunk 5: 2,500 rows, sparsity: 0.832, total: 12,500
Chunk 6: 2,500 rows, sparsity: 0.833, total: 15,000
Chunk 7: 2,500 rows, sparsity: 0.829, total: 17,500
Chunk 8: 2,500 rows, sparsity: 0.827, total: 20,000
Chunk 9: 2,500 rows, sparsity: 0.825, total: 22,500
Chunk 10: 2,500 rows, sparsity: 0.826, total: 25,000
  Memory check at chunk 10: 96.47 GB
Chunk 11: 2,500 rows, sparsity: 0.838, total: 27,500
Chunk 12: 2,500 rows, sparsity: 0.871, total: 30,000
Chunk 13: 2,500 rows, sparsity: 0.869, total: 32,500
Chunk 14: 2,500 rows, sparsity: 0.850, total: 35,000
Chunk 15: 2,500 rows, sparsity: 0.834, total: 37,500
Chunk 16: 2,500 rows, sparsity: 0.832, total: 40,000
Chunk 17: 2,500 rows, sparsity: 0.832, total: 42,500
Chunk 18: 2,500 rows, sparsity: 0.831, total: 45,000
Chunk 19: 2,500 rows, sparsity: 0.827, total: 47,500
Chunk 20: 2,500 rows, sparsity: 0.827, total: 50,000
  Memory check at chunk 20: 97.19 GB
Chunk 21: 2,500 rows, sparsity: 0.827, total: 52,500
Chunk 22: 2,500 rows, sparsity: 0.825, total: 55,000
Chunk 23: 2,500 rows, sparsity: 0.825, total: 57,500
Chunk 24: 2,500 rows, sparsity: 0.825, total: 60,000
Chunk 25: 2,500 rows, sparsity: 0.835, total: 62,500
Chunk 26: 2,500 rows, sparsity: 0.855, total: 65,000
Chunk 27: 2,500 rows, sparsity: 0.855, total: 67,500
Chunk 28: 2,500 rows, sparsity: 0.845, total: 70,000
Chunk 29: 2,500 rows, sparsity: 0.829, total: 72,500
Chunk 30: 2,500 rows, sparsity: 0.829, total: 75,000
  Memory check at chunk 30: 97.81 GB
Chunk 31: 2,500 rows, sparsity: 0.831, total: 77,500
Chunk 32: 2,500 rows, sparsity: 0.835, total: 80,000
Chunk 33: 2,500 rows, sparsity: 0.835, total: 82,500
Chunk 34: 2,500 rows, sparsity: 0.836, total: 85,000
Chunk 35: 2,500 rows, sparsity: 0.836, total: 87,500
Chunk 36: 2,500 rows, sparsity: 0.821, total: 90,000
Chunk 37: 2,500 rows, sparsity: 0.822, total: 92,500
Chunk 38: 2,500 rows, sparsity: 0.823, total: 95,000
Chunk 39: 2,500 rows, sparsity: 0.823, total: 97,500
Chunk 40: 2,500 rows, sparsity: 0.855, total: 100,000
  Memory check at chunk 40: 98.57 GB
Chunk 41: 2,500 rows, sparsity: 0.869, total: 102,500
Chunk 42: 2,500 rows, sparsity: 0.868, total: 105,000
Chunk 43: 942 rows, sparsity: 0.869, total: 105,942
Reached final chunk

Combining 43 chunks...

Conversion complete!
  Final shape: (105942, 23418)
  Average chunk sparsity: 0.836
  Final sparsity: 0.000
  Non-zero elements: 407,024,875
Saving sparse matrix to: /share/crsp/lab/pkaiser/ddlin/single-cell-multimodal-ml/data/processed/train_multi_targets_values.npz
Saving metadata to: /share/crsp/lab/pkaiser/ddlin/single-cell-multimodal-ml/data/processed/train_multi_targets_metadata.npz

Output files:
  Sparse data: 824.0 MB
  Metadata: 0.9 MB
  Total: 824.8 MB
Completed HDF5→Sparse conversion: train_multi_targets.h5 - Final memory: 99.19 GB (Δ: +3.46 GB)
In [12]:
sparse_path, metadata_path = convert_h5_to_sparse_csr(
    str(FP_MULTIOME_TEST_INPUTS),
    output_name="test_multi_inputs",
)
Starting HDF5→Sparse conversion: test_multi_inputs.h5 - Initial memory: 96.15 GB
Could not determine optimal chunk size: 'FrameFixed' object has no attribute 'ncols'
Starting processing with chunk size: 2,500
Dataset columns: 228,942
Chunk 1: 2,500 rows, sparsity: 0.978, total: 2,500
Chunk 2: 2,500 rows, sparsity: 0.978, total: 5,000
Chunk 3: 2,500 rows, sparsity: 0.978, total: 7,500
Chunk 4: 2,500 rows, sparsity: 0.974, total: 10,000
Chunk 5: 2,500 rows, sparsity: 0.968, total: 12,500
Chunk 6: 2,500 rows, sparsity: 0.967, total: 15,000
Chunk 7: 2,500 rows, sparsity: 0.970, total: 17,500
Chunk 8: 2,500 rows, sparsity: 0.978, total: 20,000
Chunk 9: 2,500 rows, sparsity: 0.978, total: 22,500
Chunk 10: 2,500 rows, sparsity: 0.976, total: 25,000
  Memory check at chunk 10: 97.48 GB
Chunk 11: 2,500 rows, sparsity: 0.971, total: 27,500
Chunk 12: 2,500 rows, sparsity: 0.972, total: 30,000
Chunk 13: 2,500 rows, sparsity: 0.972, total: 32,500
Chunk 14: 2,500 rows, sparsity: 0.971, total: 35,000
Chunk 15: 2,500 rows, sparsity: 0.971, total: 37,500
Chunk 16: 2,500 rows, sparsity: 0.971, total: 40,000
Chunk 17: 2,500 rows, sparsity: 0.969, total: 42,500
Chunk 18: 2,500 rows, sparsity: 0.970, total: 45,000
Chunk 19: 2,500 rows, sparsity: 0.970, total: 47,500
Chunk 20: 2,500 rows, sparsity: 0.970, total: 50,000
  Memory check at chunk 20: 99.02 GB
Chunk 21: 2,500 rows, sparsity: 0.971, total: 52,500
Chunk 22: 2,500 rows, sparsity: 0.970, total: 55,000
Chunk 23: 935 rows, sparsity: 0.970, total: 55,935
Reached final chunk

Combining 23 chunks...

Conversion complete!
  Final shape: (55935, 228942)
  Average chunk sparsity: 0.972
  Final sparsity: 0.000
  Non-zero elements: 353,572,710
Saving sparse matrix to: /share/crsp/lab/pkaiser/ddlin/single-cell-multimodal-ml/data/processed/test_multi_inputs_values.npz
Saving metadata to: /share/crsp/lab/pkaiser/ddlin/single-cell-multimodal-ml/data/processed/test_multi_inputs_metadata.npz

Output files:
  Sparse data: 1642.1 MB
  Metadata: 2.3 MB
  Total: 1644.4 MB
Completed HDF5→Sparse conversion: test_multi_inputs.h5 - Final memory: 76.91 GB (Δ: -19.24 GB)

Dimension reduction with Truncated SVD¶

  • We will use the sparce matrices that we just created
In [9]:
sparse_matrix = scipy.sparse.load_npz(PROCESSED_DIR.joinpath("train_multi_inputs_values.npz"))

# Load metadata  
metadata = np.load(
    PROCESSED_DIR.joinpath("train_multi_inputs_metadata.npz"),
    allow_pickle=True
)
index = metadata['index']
columns = metadata['columns']
In [13]:
# Rows are cells and cols are genomic regions
sparse_matrix
Out[13]:
<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 607301546 stored elements and shape (105942, 228942)>
In [14]:
# OK these are the cells
index
Out[14]:
array(['56390cf1b95e', 'fc0c60183c33', '9b4a87e22ad0', ...,
       '00783f28b463', 'e7abb1a0f251', '193992d571a5'], dtype=object)
In [ ]:
# These are genomic regions
columns
Out[ ]:
array(['GL000194.1:114519-115365', 'GL000194.1:55758-56597',
       'GL000194.1:58217-58957', ..., 'chrY:7836768-7837671',
       'chrY:7869454-7870371', 'chrY:7873814-7874709'], dtype=object)
In [18]:
RUN_SVD = True  # Set to False if you want to load precomputed

# Dimension of SVD
n_components = 512

# Paths for saving/loading the reduced arrays and SVD objects (using "embeddings" and "model" for clarity)
TRAIN_INPUTS_EMBEDDINGS_PKL = PROCESSED_DIR / "train_multi_inputs_embeddings_512.pkl"
TEST_INPUTS_EMBEDDINGS_PKL = PROCESSED_DIR / "test_multi_inputs_embeddings_512.pkl"
TRAIN_TARGETS_EMBEDDINGS_PKL = PROCESSED_DIR / "train_multi_targets_embeddings_512.pkl"
SVD_INPUTS_MODEL_PKL = PROCESSED_DIR / "svd_inputs_model_512.pkl"
SVD_TARGETS_MODEL_PKL = PROCESSED_DIR / "svd_targets_model_512.pkl"

if RUN_SVD:
    print("Computing SVD embeddings...")

    # Process inputs first (load one at a time to save memory)
    
    # Load train inputs sparse matrix
    train_multi_inputs = scipy.sparse.load_npz(PROCESSED_DIR / "train_multi_inputs_values.npz")
    
    # SVD for inputs: fit on train
    svd_inputs = TruncatedSVD(n_components=n_components, random_state=42)
    X = svd_inputs.fit_transform(train_multi_inputs)
    
    # Save train embeddings and model so far
    with open(TRAIN_INPUTS_EMBEDDINGS_PKL, "wb") as f:
        pickle.dump(X, f)
    with open(SVD_INPUTS_MODEL_PKL, "wb") as f:
        pickle.dump(svd_inputs, f)
    
    # Cleanup train inputs
    del train_multi_inputs, X
    gc.collect()
    
    # Load test inputs sparse matrix
    test_multi_inputs = scipy.sparse.load_npz(PROCESSED_DIR / "test_multi_inputs_values.npz")
    
    # Transform test
    Xt = svd_inputs.transform(test_multi_inputs)
    
    # Save test embeddings
    with open(TEST_INPUTS_EMBEDDINGS_PKL, "wb") as f:
        pickle.dump(Xt, f)
    
    # Cleanup test inputs
    del test_multi_inputs, Xt
    gc.collect()
    
    # Now process targets (only train targets available)
    
    # Load train targets sparse matrix
    train_multi_targets = scipy.sparse.load_npz(PROCESSED_DIR / "train_multi_targets_values.npz")
    
    # SVD for targets: fit on train
    svd_targets = TruncatedSVD(n_components=n_components, random_state=42)
    Y = svd_targets.fit_transform(train_multi_targets)
    
    # Save targets embeddings and model
    with open(TRAIN_TARGETS_EMBEDDINGS_PKL, "wb") as f:
        pickle.dump(Y, f)
    with open(SVD_TARGETS_MODEL_PKL, "wb") as f:
        pickle.dump(svd_targets, f)
    
    # Cleanup
    del train_multi_targets, Y
    gc.collect()

    print("Computed and saved SVD embeddings (512 PCs).")

else:
    # Load precomputed reduced arrays
    with open(TRAIN_INPUTS_EMBEDDINGS_PKL, "rb") as f:
        X = pickle.load(f)
    with open(TEST_INPUTS_EMBEDDINGS_PKL, "rb") as f:
        Xt = pickle.load(f)
    with open(TRAIN_TARGETS_EMBEDDINGS_PKL, "rb") as f:
        Y = pickle.load(f)

    # Load precomputed SVD objects
    with open(SVD_INPUTS_MODEL_PKL, "rb") as f:
        svd_inputs = pickle.load(f)
    with open(SVD_TARGETS_MODEL_PKL, "rb") as f:
        svd_targets = pickle.load(f)

    print(f"Loaded precomputed SVD embeddings X (512 PCs): {X.shape}, Xt: {Xt.shape}, Y: {Y.shape}")
Computing SVD embeddings...
Computed and saved SVD embeddings (512 PCs).

Greate Parquet files¶

  • Parquet is a more efficient format than csv
  • After conversion, simply update your data loading calls from pd.read_csv('filename.csv') to pd.read_parquet('filename.parquet') when reading into DataFrames.
In [11]:
METADATA = RAW_DIR.joinpath("metadata.csv")
EVAL = RAW_DIR.joinpath("evaluation_ids.csv")
SUBMIT = RAW_DIR.joinpath("sample_submission.csv")


def convert_to_parquet(path_to_csv, out_filename):
    df = pd.read_csv(path_to_csv)
    df.to_parquet(PROCESSED_DIR.joinpath(f"{out_filename}.parquet"))
In [12]:
convert_to_parquet(METADATA, "metadata")
In [13]:
convert_to_parquet(EVAL, "evaluation")
In [14]:
convert_to_parquet(SUBMIT, "sample_submission")
In [ ]: