#!/usr/bin/env python3
"""
Cerebral Palsy Synthetic Data Generator
Literature-based probabilistic generation with 2024 research data

Based on:
- Nigerian study (2020): 70% spastic, GMFCS distribution
- Norwegian cohort (2014): Gestational age risk curves
- Ghana CP register (2024): 78% preterm in CP cases
- African systematic reviews: Birth asphyxia 47.6%, kernicterus 23.8%
- Slovenian case-control: SGA odds ratio 2.43
"""

import argparse
import csv
import math
import random
from collections import Counter
from datetime import datetime
from typing import Dict, List, Optional, Tuple

# random.random() returns values in [0.0, 1.0) with 53-bit precision, so it can
# return exactly 0.0. That draw is replaced by this smallest non-zero step so
# that math.log() in the Box-Muller transform never sees 0.
_MIN_UNIFORM = 2.0 ** -53

# (exclusive upper bound on gestational age in weeks, absolute CP risk added)
_GESTATIONAL_AGE_RISK = ((28, 0.085), (31, 0.056), (34, 0.020), (37, 0.004))

# Factor applied to the typical milestone age, keyed by GMFCS level.
_GMFCS_DELAY_MULTIPLIER = {1: 1.5, 2: 2.0, 3: 2.5, 4: 4.0, 5: 6.0}


class CPDataGenerator:
    """Generate synthetic cerebral palsy datasets based on literature"""

    def __init__(self, african_context: bool = True, seed: Optional[int] = None):
        """
        Initialize the generator

        Args:
            african_context: If True, uses African population statistics
                (higher risk factors)
            seed: If given, seeds the module-level ``random`` generator so the
                samples drawn afterwards are reproducible. If None the global
                generator is left untouched, so a seed set by the caller
                beforehand stays in effect.
        """
        self.african_context = african_context
        # Do NOT reseed unconditionally here: that would silently discard a
        # seed the caller installed before constructing the generator.
        if seed is not None:
            random.seed(seed)

    def normal_random(self, mean: float, std_dev: float) -> float:
        """
        Draw from a normal distribution using the Box-Muller transform

        Args:
            mean: Mean of the distribution
            std_dev: Standard deviation of the distribution

        Returns:
            One normally distributed value
        """
        # Both uniforms are always drawn, in this order, so the random stream
        # is consumed identically whether or not the clamp applies.
        u1 = max(random.random(), _MIN_UNIFORM)
        u2 = random.random()
        z0 = math.sqrt(-2 * math.log(u1)) * math.cos(2 * math.pi * u2)
        return mean + z0 * std_dev

    def _context_rate(self, african: float, other: float) -> float:
        """Pick the rate matching the configured population context."""
        return african if self.african_context else other

    def _milestone_age(self, mean: float, std_dev: float) -> float:
        """Draw a milestone age, floored at 0 because an age cannot be negative."""
        return max(0.0, self.normal_random(mean, std_dev))

    def _cp_probability(
        self,
        gestational_age: float,
        birth_weight: float,
        is_sga: bool,
        birth_asphyxia: bool,
        neonatal_seizures: bool,
        hyperbilirubinemia: bool,
        neonatal_infection: bool,
        maternal_infection: bool,
        preclampsia: bool,
    ) -> float:
        """Combine the risk factors into a CP probability capped at 0.90."""
        probability = 0.0025  # Base: 2.5 per 1000

        # Gestational age effects (from Norwegian study): only the first
        # (most premature) matching band contributes.
        for upper_bound, added_risk in _GESTATIONAL_AGE_RISK:
            if gestational_age < upper_bound:
                probability += added_risk
                break

        # Birth weight effects
        if birth_weight < 1.5:  # very low birth weight
            probability += 0.08  # 5-15% develop CP
        elif birth_weight < 2.5:  # low birth weight
            probability += 0.03

        # SGA effect. The multiplier scales only the terms accumulated so far.
        # NOTE(review): the module docstring cites an odds ratio of 2.43 while
        # the multiplier applied here is 2.0.
        if is_sga:
            probability *= 2.0

        # Perinatal complications (African context prioritized)
        if birth_asphyxia:
            probability += self._context_rate(0.20, 0.15)
        if neonatal_seizures:
            probability += 0.25
        if hyperbilirubinemia:
            probability += self._context_rate(0.12, 0.05)
        if neonatal_infection:
            probability += self._context_rate(0.15, 0.08)
        if maternal_infection:
            probability += 0.06
        if preclampsia:
            probability *= 0.6  # Protective effect

        return min(probability, 0.90)  # Cap at 90%

    def _draw_cp_profile(self) -> Tuple[str, str, str, int]:
        """Draw (cp_type, cp_subtype, tone_abnormality, gmfcs_level) for a CP case."""
        # CP TYPE - Based on Nigerian study and literature
        type_rand = random.random()
        if type_rand < 0.70:  # 70% spastic
            cp_type = 'spastic'
            cp_subtype = 'bilateral' if random.random() < 0.60 else 'unilateral'
            tone_abnormality = 'hypertonia'
        elif type_rand < 0.798:  # 9.8% ataxic
            cp_type = 'ataxic'
            cp_subtype = 'generalized'
            tone_abnormality = 'hypotonia'
        elif type_rand < 0.844:  # 4.6% dystonic
            cp_type = 'dystonic'
            cp_subtype = 'generalized'
            tone_abnormality = 'variable'
        elif type_rand < 0.919:  # 7.5% choreoathetoid
            cp_type = 'choreoathetoid'
            cp_subtype = 'generalized'
            tone_abnormality = 'variable'
        else:  # 8.1% mixed/unclassifiable
            cp_type = 'mixed'
            cp_subtype = 'variable'
            tone_abnormality = 'mixed'

        # GMFCS LEVEL - Based on Nigerian distribution
        gmfcs_rand = random.random()
        if gmfcs_rand < 0.181:
            gmfcs_level = 1  # 18.1%
        elif gmfcs_rand < 0.583:
            gmfcs_level = 2  # 40.2%
        elif gmfcs_rand < 0.722:
            gmfcs_level = 3  # 13.9%
        elif gmfcs_rand < 0.861:
            gmfcs_level = 4  # 13.9%
        else:
            gmfcs_level = 5  # 13.9%

        return cp_type, cp_subtype, tone_abnormality, gmfcs_level

    def _draw_milestones(
        self, gmfcs_level: Optional[int]
    ) -> Tuple[float, float, Optional[float], Optional[float]]:
        """
        Draw (head_control, sitting, crawling, walking) milestone ages.

        ``gmfcs_level`` is None for a child without CP (typical development).
        Crawling and walking are None for GMFCS levels 4-5.
        Ages are presumably in months - the code does not state the unit.
        """
        if gmfcs_level is None:
            # Typical development
            return (
                self._milestone_age(2, 0.5),
                self._milestone_age(6, 1.0),
                self._milestone_age(9, 1.5),
                self._milestone_age(12, 2.0),
            )

        delay_multiplier = _GMFCS_DELAY_MULTIPLIER[gmfcs_level]
        head_control_age = self._milestone_age(2 * delay_multiplier, 1.5)
        sitting_age = self._milestone_age(6 * delay_multiplier, 2.5)

        # GMFCS 4-5 may never achieve crawling/walking
        if gmfcs_level >= 4:
            return head_control_age, sitting_age, None, None

        crawling_age = self._milestone_age(9 * delay_multiplier, 3)
        if gmfcs_level == 3:
            walking_age = self._milestone_age(24, 8)
        else:
            walking_age = self._milestone_age(12 * delay_multiplier, 4)
        return head_control_age, sitting_age, crawling_age, walking_age

    def generate_sample(self, sample_id: int) -> Dict:
        """
        Generate a single synthetic sample

        Args:
            sample_id: Unique identifier for this sample

        Returns:
            Dictionary containing all features for one patient. CP-specific
            fields are None when ``has_cp`` is False, and the two
            African-specific postnatal fields are None outside the African
            context.
        """
        # GESTATIONAL AGE - Bimodal distribution for African context
        is_preterm_birth = random.random() < self._context_rate(0.19, 0.11)
        if is_preterm_birth:
            # 24.0 (not 24) keeps the column uniformly float when clamped.
            gestational_age = max(24.0, self.normal_random(32, 3.5))
        else:
            gestational_age = self.normal_random(39, 1.3)

        # BIRTH WEIGHT - Conditional on gestational age
        # (presumably kilograms - the code does not state the unit)
        is_preterm = gestational_age < 37
        if is_preterm:
            birth_weight = max(0.5, self.normal_random(2.1, 0.6))
        else:
            birth_weight = self.normal_random(3.2, 0.45)
        is_sga = birth_weight < (1.8 if is_preterm else 2.5)

        # PERINATAL RISK FACTORS - African context has higher rates
        birth_asphyxia = random.random() < self._context_rate(0.12, 0.05)
        neonatal_seizures = random.random() < 0.06
        hyperbilirubinemia = random.random() < self._context_rate(0.15, 0.08)
        neonatal_infection = random.random() < self._context_rate(0.10, 0.04)
        maternal_infection = random.random() < 0.12
        preclampsia = random.random() < 0.08

        # CALCULATE CP PROBABILITY - Based on literature odds ratios
        cp_probability = self._cp_probability(
            gestational_age,
            birth_weight,
            is_sga,
            birth_asphyxia,
            neonatal_seizures,
            hyperbilirubinemia,
            neonatal_infection,
            maternal_infection,
            preclampsia,
        )
        has_cp = random.random() < cp_probability

        # CP-SPECIFIC FEATURES
        cp_type = cp_subtype = tone_abnormality = gmfcs_level = None
        if has_cp:
            cp_type, cp_subtype, tone_abnormality, gmfcs_level = (
                self._draw_cp_profile()
            )

        # MOTOR MILESTONES
        head_control_age, sitting_age, crawling_age, walking_age = (
            self._draw_milestones(gmfcs_level)
        )

        # COMORBIDITIES - Higher rates with CP
        epilepsy = random.random() < (
            self._context_rate(0.40, 0.35) if has_cp else 0.01
        )
        feeding_difficulties = random.random() < (0.55 if has_cp else 0.08)
        visual_impairment = random.random() < (
            self._context_rate(0.40, 0.30) if has_cp else 0.05
        )
        hearing_impairment = random.random() < (0.22 if has_cp else 0.03)
        speech_impairment = random.random() < (0.45 if has_cp else 0.05)
        intellectual_disability = random.random() < (0.50 if has_cp else 0.02)

        # African-specific postnatal risks: only drawn for CP cases in the
        # African context; reported as None outside that context.
        malaria_with_seizures = None
        tuberculous_meningitis = None
        if self.african_context:
            malaria_with_seizures = has_cp and random.random() < 0.10
            tuberculous_meningitis = has_cp and random.random() < 0.04

        return {
            'id': sample_id,
            'gestational_age': round(gestational_age, 1),
            'birth_weight': round(birth_weight, 2),
            'is_sga': is_sga,
            'birth_asphyxia': birth_asphyxia,
            'neonatal_seizures': neonatal_seizures,
            'hyperbilirubinemia': hyperbilirubinemia,
            'neonatal_infection': neonatal_infection,
            'maternal_infection': maternal_infection,
            'preclampsia': preclampsia,
            'malaria_with_seizures': malaria_with_seizures,
            'tuberculous_meningitis': tuberculous_meningitis,
            'head_control_age': round(head_control_age, 1),
            'sitting_age': round(sitting_age, 1),
            # "is not None" (not truthiness) so an age of 0.0 is not dropped.
            'crawling_age': (
                round(crawling_age, 1) if crawling_age is not None else None
            ),
            'walking_age': (
                round(walking_age, 1) if walking_age is not None else None
            ),
            'epilepsy': epilepsy,
            'feeding_difficulties': feeding_difficulties,
            'visual_impairment': visual_impairment,
            'hearing_impairment': hearing_impairment,
            'speech_impairment': speech_impairment,
            'intellectual_disability': intellectual_disability,
            'tone_abnormality': tone_abnormality,
            'has_cp': has_cp,
            'cp_type': cp_type,
            'cp_subtype': cp_subtype,
            'gmfcs_level': gmfcs_level,
            'cp_probability_score': round(cp_probability, 3),
        }

    def generate_dataset(self, num_samples: int) -> List[Dict]:
        """
        Generate multiple samples and print summary statistics

        Args:
            num_samples: Number of samples to generate (0 yields an empty list)

        Returns:
            List of sample dictionaries, with ids running from 1 to num_samples

        Raises:
            ValueError: If num_samples is negative
        """
        if num_samples < 0:
            raise ValueError(
                f"num_samples must be non-negative, got {num_samples}"
            )

        print(f"Generating {num_samples} samples...")
        data = [self.generate_sample(i + 1) for i in range(num_samples)]

        # Calculate statistics
        print("\nDataset Statistics:")
        print(f" Total samples: {num_samples}")
        if not data:
            # Nothing to average; the ratios below would divide by zero.
            return data

        cp_cases = sum(1 for d in data if d['has_cp'])
        avg_ga = sum(d['gestational_age'] for d in data) / len(data)
        preterm_count = sum(1 for d in data if d['gestational_age'] < 37)

        print(f" CP cases: {cp_cases} ({cp_cases/num_samples*100:.2f}%)")
        print(f" Avg gestational age: {avg_ga:.1f} weeks")
        print(f" Preterm births: {preterm_count} ({preterm_count/num_samples*100:.1f}%)")

        if cp_cases > 0:
            spastic_count = sum(1 for d in data if d['cp_type'] == 'spastic')
            print(f" Spastic CP: {spastic_count} ({spastic_count/cp_cases*100:.1f}%)")

            print("\n GMFCS Distribution:")
            level_counts = Counter(d['gmfcs_level'] for d in data)
            for level in range(1, 6):
                count = level_counts[level]
                print(f" Level {level}: {count} ({count/cp_cases*100:.1f}%)")

        return data

    def save_to_csv(self, data: List[Dict], filename: Optional[str] = None):
        """
        Save dataset to CSV file

        The header row is taken from the keys of the first sample. Nothing is
        written when ``data`` is empty.

        Args:
            data: List of sample dictionaries
            filename: Output filename (auto-generated if None)
        """
        if not data:
            print("\nNo samples to save; no file written.")
            return

        if not filename:
            context = 'africa' if self.african_context else 'global'
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f'cp_synthetic_{context}_{timestamp}.csv'

        # newline='' is required by the csv module to avoid doubled line ends.
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=list(data[0]))
            writer.writeheader()
            writer.writerows(data)

        print(f"\n✓ Dataset saved to: {filename}")


def main():
    """Command-line entry point: parse options, generate a dataset, save it."""
    parser = argparse.ArgumentParser(
        description='Generate synthetic CP dataset based on literature'
    )
    parser.add_argument(
        '-n', '--num-samples',
        type=int,
        default=1000,
        help='Number of samples to generate (default: 1000)'
    )
    parser.add_argument(
        '-c', '--context',
        choices=['african', 'global'],
        default='african',
        help='Population context (default: african)'
    )
    parser.add_argument(
        '-o', '--output',
        type=str,
        help='Output CSV filename (auto-generated if not specified)'
    )
    parser.add_argument(
        '-s', '--seed',
        type=int,
        help='Random seed for reproducibility'
    )

    args = parser.parse_args()

    if args.num_samples < 0:
        parser.error('--num-samples must be non-negative')

    if args.seed is not None:
        print(f"Using random seed: {args.seed}")

    # Create generator; the seed is applied by the constructor so it is not
    # overwritten between seeding and sampling.
    generator = CPDataGenerator(
        african_context=(args.context == 'african'),
        seed=args.seed,
    )

    # Generate dataset
    data = generator.generate_dataset(args.num_samples)

    # Save to CSV
    generator.save_to_csv(data, args.output)


if __name__ == '__main__':
    main()