#!/usr/bin/env python3
"""
Example script for loading the African CP Synthetic Dataset from Hugging Face

Usage:
    python load_dataset.py
"""

import pandas as pd

try:
    from datasets import load_dataset
except ImportError:
    # `datasets` is only needed by load_from_hub(); the pandas-based loaders
    # below must stay usable when it is not installed.
    load_dataset = None


# Split name -> CSV file, shared by load_from_hub() and load_preprocessed()
# so the two loaders cannot drift apart.
SPLIT_FILES = {
    'train_1k': 'africa_cp_train_1000.csv',
    'train_5k': 'africa_cp_train_5000.csv',
    'train_10k': 'africa_cp_train_10000.csv',
    'balanced': 'africa_cp_balanced_1000.csv',
    'preterm': 'africa_cp_preterm_2000.csv',
    'cp_only': 'africa_cp_cases_only_500.csv',
    'test': 'africa_cp_test_2000.csv',
}

# Columns returned as the feature matrix by load_preprocessed().
FEATURE_COLUMNS = [
    'gestational_age', 'birth_weight', 'is_sga', 'birth_asphyxia',
    'neonatal_seizures', 'hyperbilirubinemia', 'neonatal_infection',
    'maternal_infection', 'preclampsia', 'malaria_with_seizures',
    'tuberculous_meningitis', 'head_control_age', 'sitting_age',
    'epilepsy', 'feeding_difficulties', 'visual_impairment',
    'hearing_impairment', 'speech_impairment', 'intellectual_disability',
]

# Value substituted for missing feature values (milestones not achieved).
MISSING_VALUE_SENTINEL = 999


# Method 1: Load with the Hugging Face `datasets` library
def load_from_hub(split='train_5k'):
    """
    Load one split with the Hugging Face `datasets` CSV builder.

    The CSV file is resolved relative to the current working directory;
    only the file belonging to the requested split is read.

    Args:
        split: Which split to load
            - 'train_1k', 'train_5k', 'train_10k' (training sets)
            - 'balanced' (50/50 CP/non-CP)
            - 'preterm' (high-risk population)
            - 'cp_only' (CP cases only)
            - 'test' (hold-out validation set)

    Returns:
        The `datasets` dataset object for the requested split.

    Raises:
        ImportError: If the `datasets` package is not installed.
        KeyError: If `split` is not one of the names listed above.
    """
    if load_dataset is None:
        raise ImportError(
            "load_from_hub() requires the 'datasets' package "
            "(pip install datasets)"
        )

    # Plain lookup on purpose: an unknown split raises KeyError, the same
    # exception type indexing the loaded dataset dict would have produced.
    filename = SPLIT_FILES[split]
    dataset = load_dataset('csv', data_files={split: filename})
    return dataset[split]


# Method 2: Load directly as pandas DataFrame
def load_as_dataframe(filename='africa_cp_train_5000.csv'):
    """
    Load dataset directly as pandas DataFrame

    Args:
        filename: CSV filename to load

    Returns:
        DataFrame with the file's contents, unmodified.
    """
    return pd.read_csv(filename)


# Method 3: Load with custom preprocessing
def load_preprocessed(split='train_5k'):
    """
    Load dataset with basic preprocessing applied

    Args:
        split: Split name; one of the keys of SPLIT_FILES.

    Returns:
        Tuple (X, y): X is a DataFrame holding the FEATURE_COLUMNS with
        missing values replaced by 999, y is the 'has_cp' column.

    Raises:
        ValueError: If `split` is not a known split name.
    """
    try:
        filename = SPLIT_FILES[split]
    except KeyError:
        raise ValueError(f"Unknown split: {split}") from None

    df = pd.read_csv(filename)

    # Handle missing values (milestones not achieved). The sentinel is
    # applied to every feature column, not only the milestone ages.
    X = df[FEATURE_COLUMNS].fillna(MISSING_VALUE_SENTINEL)
    y = df['has_cp']

    return X, y


def main():
    """Run the usage examples against the CSV files in the working directory."""
    print("Loading African CP Synthetic Dataset...")
    print("="*60)

    # Example 1: Load as DataFrame
    print("\n1. Loading training set (5K) as DataFrame...")
    df = load_as_dataframe('africa_cp_train_5000.csv')
    print(f" Shape: {df.shape}")
    print(f" CP cases: {df['has_cp'].sum()} ({df['has_cp'].mean()*100:.1f}%)")
    print(f" Features: {df.columns.tolist()[:5]}...")

    # Example 2: Load preprocessed
    print("\n2. Loading preprocessed training set...")
    X, y = load_preprocessed('train_5k')
    print(f" X shape: {X.shape}")
    print(f" y shape: {y.shape}")
    print(f" Class distribution: {y.value_counts().to_dict()}")

    # Example 3: Load test set
    print("\n3. Loading test set...")
    test_df = load_as_dataframe('africa_cp_test_2000.csv')
    print(f" Test samples: {len(test_df)}")
    print(f" Test CP cases: {test_df['has_cp'].sum()}")

    # Example 4: Show dataset statistics
    print("\n4. Dataset Statistics:")
    print(f" Average gestational age: {df['gestational_age'].mean():.1f} weeks")
    print(f" Preterm births: {(df['gestational_age'] < 37).sum()} ({(df['gestational_age'] < 37).mean()*100:.1f}%)")
    print(f" Birth asphyxia: {df['birth_asphyxia'].sum()} ({df['birth_asphyxia'].mean()*100:.1f}%)")

    # Compare against 1 instead of indexing with the raw column: with a 0/1
    # integer column, df[df['has_cp']] is treated as column selection and
    # raises KeyError. The comparison works for both bool and 0/1 data.
    cp_cases = df[df['has_cp'] == 1]
    print("\n5. CP Cases Breakdown:")
    print(f" Spastic: {(cp_cases['cp_type'] == 'spastic').sum()}")
    print(f" GMFCS distribution:")
    for level in range(1, 6):
        count = (cp_cases['gmfcs_level'] == level).sum()
        print(f" Level {level}: {count}")

    print("\n" + "="*60)
    print("Dataset loaded successfully!")
    print("\nNext steps:")
    print(" 1. Explore the data: df.describe(), df.info()")
    print(" 2. Build models: See README.md for training protocols")
    print(" 3. Evaluate: Use africa_cp_test_2000.csv for final validation")


# Example usage
if __name__ == '__main__':
    main()