Comorbidity distributor

`ComorbidityDistributor`

High-performance distributor to assign comorbidities to people based on their age, sex, ethnicity, and region. Uses vectorized operations and batch processing for UK-scale simulations.

Source code in june/distributors/comorbidity_distributor.py

class ComorbidityDistributor:
    """High-performance distributor to assign comorbidities to people based on their age, sex, ethnicity,
    and region. Uses vectorized operations and batch processing for UK-scale simulations.

    """

    def __init__(self):
        """
        Initialize the OptimizedComorbidityDistributor with data path and configuration.
        """
        self.script_dir = Path(__file__).parent.parent.parent  # Go up to june-measles root
        self.data_path = self.script_dir / "data" / "input" / "demography" / "comorbidities" / "processed_counts_midpoint_rounded_proportions_n.csv"

        # Optimized data structures
        self._comorbidity_data = None
        self._data_loaded = False
        self._demographics_cache = {}
        self._probability_arrays = {}

        # Define condition columns from the CSV
        self.condition_columns = [
            'has_had_cvd_diagnosis_count_midpoint_rounded',
            'has_had_crd_diagnosis_count_midpoint_rounded', 
            'has_had_ckd_diagnosis_count_midpoint_rounded',
            'has_had_cld_diagnosis_count_midpoint_rounded',
            'severe_obesity_count_midpoint_rounded',
            'has_had_cancer_diagnosis_count_midpoint_rounded',
            'has_had_immunosuppression_diagnosis_count_midpoint_rounded',
            'has_had_neuro_diagnosis_count_midpoint_rounded'
        ]

        # Map condition columns to readable names
        self.condition_names = {
            'has_had_cvd_diagnosis_count_midpoint_rounded': 'cardiovascular_disease',
            'has_had_crd_diagnosis_count_midpoint_rounded': 'chronic_respiratory_disease', 
            'has_had_ckd_diagnosis_count_midpoint_rounded': 'chronic_kidney_disease',
            'has_had_cld_diagnosis_count_midpoint_rounded': 'chronic_liver_disease',
            'severe_obesity_count_midpoint_rounded': 'severe_obesity',
            'has_had_cancer_diagnosis_count_midpoint_rounded': 'cancer',
            'has_had_immunosuppression_diagnosis_count_midpoint_rounded': 'immunosuppression',
            'has_had_neuro_diagnosis_count_midpoint_rounded': 'neurological_condition'
        }

    def _load_comorbidity_data(self):
        """Load and optimize the comorbidity prevalence data from CSV file."""
        if self._data_loaded:
            return

        logger.info("Loading and optimizing comorbidity prevalence data...")

        try:
            if not self.data_path.exists():
                raise FileNotFoundError(f"Comorbidity data file not found at {self.data_path}")

            # Load CSV with optimized dtypes
            dtype_dict = {
                'sex': 'category',
                'age_band_min': 'category', 
                'combined_ethnicity_less': 'category',
                'region': 'category'
            }

            self._comorbidity_data = pd.read_csv(self.data_path, dtype=dtype_dict)
            logger.info(f"Loaded comorbidity data: {len(self._comorbidity_data)} demographic groups")

            # Create optimized multi-index lookup
            self._comorbidity_data.set_index(['sex', 'age_band_min', 'combined_ethnicity_less', 'region'], 
                                           inplace=True)

            # Pre-compute probability matrices for vectorized operations
            self._precompute_probability_matrices()

            self._data_loaded = True
            logger.info("Comorbidity data optimization completed")

        except Exception as e:
            logger.error(f"Error loading comorbidity data: {e}")
            self._data_loaded = False
            raise

    def _precompute_probability_matrices(self):
        """Pre-compute probability matrices for faster vectorized lookups."""
        logger.info("Pre-computing probability matrices...")

        # Create fast lookup arrays for each demographic combination
        for idx, row in self._comorbidity_data.iterrows():
            key = idx  # Multi-index tuple

            # Store probabilities as numpy array for fast access
            probs = np.array([row[col] for col in self.condition_columns])
            has_any_prob = row.get('has_comorbidity_midpoint_rounded', 0.0)
            multiple_prob = row.get('multiple_morbidities_count_midpoint_rounded', 0.0)

            self._probability_arrays[key] = {
                'condition_probs': probs,
                'has_any_prob': has_any_prob,
                'multiple_prob': multiple_prob,
                'condition_names': list(self.condition_names.values())
            }

    def _get_person_demographics(self, person: Person) -> Tuple[str, str, str, str]:
        """Extract and cache person demographics in optimized format.

        Args:
            person (Person): The person object

        Returns:
            Tuple[str, str, str, str]: (sex, age_band, ethnicity, region) tuple

        """
        # Convert sex to CSV format
        sex = "female" if person.sex == "f" else "male"

        # Get age band
        age = person.age
        if age < 10:
            age_band = "0-9"
        elif age < 18:
            age_band = "10-17"
        elif age < 30:
            age_band = "18-29"
        elif age < 50:
            age_band = "30-49"
        elif age < 75:
            age_band = "50-74"
        else:
            age_band = "75-99"

        # Get ethnicity and map to CSV format
        ethnicity = getattr(person, 'ethnicity', 'W')
        if ethnicity == 'O':  # Other -> Combined Other
            ethnicity = 'CO'

        # Get region with mapping
        region = self._get_person_region_fast(person)

        return sex, age_band, ethnicity, region

    def _get_person_region_fast(self, person: Person) -> str:
        """Fast region extraction with caching.

        Args:
            person (Person): The person object

        Returns:
            str: The region name with mappings applied

        """
        # Use caching for region lookups
        person_id = getattr(person, 'id', None)
        if person_id and person_id in self._demographics_cache:
            return self._demographics_cache[person_id]['region']

        region_name = None

        # Try to get region from person's household or area
        if hasattr(person, 'household') and person.household:
            if hasattr(person.household, 'area') and person.household.area:
                if hasattr(person.household.area, 'region'):
                    region_name = person.household.area.region.name

        # Try alternative attributes
        if not region_name and hasattr(person, 'area') and person.area:
            if hasattr(person.area, 'region'):
                region_name = person.area.region.name

        # Apply regional mappings
        if region_name:
            region_lower = region_name.lower()
            # Map non-English regions to English regions
            if region_lower == 'scotland':
                region_name = 'North East'
            elif region_lower == 'wales':
                region_name = 'North West'
            elif region_lower in ['northern ireland', 'northernireland']:
                region_name = 'North West'
            else:
                # Map full UK region names to CSV shorthand
                region_mappings = {
                    'east of england': 'East',
                    'east midlands': 'East Midlands',
                    'greater london': 'London',
                    'london': 'London',
                    'north east': 'North East',
                    'north west': 'North West', 
                    'south east': 'South East',
                    'south west': 'South West',
                    'west midlands': 'West Midlands',
                    'yorkshire and the humber': 'Yorkshire and The Humber',
                    'yorkshire and humber': 'Yorkshire and The Humber'
                }

                mapped_region = region_mappings.get(region_lower)
                if mapped_region:
                    region_name = mapped_region

        return region_name or 'London'  # Default fallback

    def _group_people_by_demographics(self, people: List[Person]) -> Dict[Tuple, List[Person]]:
        """Group people by their demographic characteristics for batch processing.

        Args:
            people (List[Person]): List of people to group

        Returns:
            Dict[Tuple, List[Person]]: Dictionary mapping demographic tuples to lists of people

        """
        groups = defaultdict(list)

        for person in people:
            demographics = self._get_person_demographics(person)
            groups[demographics].append(person)

        return dict(groups)

    def _assign_comorbidities_to_group(self, people: List[Person], demographics: Tuple) -> None:
        """Assign comorbidities to a group of people with same demographics using vectorized operations.

        Args:
            people (List[Person]): List of people with same demographics
            demographics (Tuple): (sex, age_band, ethnicity, region) tuple

        """
        if not people:
            return

        n_people = len(people)
        sex, age_band, ethnicity, region = demographics

        # Try exact match first
        if demographics in self._probability_arrays:
            prob_data = self._probability_arrays[demographics]
        else:
            # Find fallback data efficiently
            prob_data = self._find_fallback_data_optimized(demographics)

        if not prob_data:
            # Use age-based defaults
            prob_data = self._get_default_probabilities_optimized(age_band)

        # Vectorized probability calculations
        condition_probs = prob_data['condition_probs']
        has_any_prob = prob_data['has_any_prob']
        multiple_prob = prob_data['multiple_prob']
        condition_names = prob_data['condition_names']

        # Vectorized random sampling
        has_any_conditions = np.random.random(n_people) < has_any_prob
        has_multiple = np.random.random(n_people) < multiple_prob

        # For people who should have conditions, sample which ones
        for i, person in enumerate(people):
            conditions = set()

            if has_any_conditions[i]:
                # Sample individual conditions
                condition_mask = np.random.random(len(condition_probs)) < condition_probs
                candidate_conditions = [condition_names[j] for j, has_condition in enumerate(condition_mask) if has_condition]

                if candidate_conditions:
                    if has_multiple[i] and len(candidate_conditions) > 1:
                        # Keep multiple conditions
                        conditions.update(candidate_conditions)
                    else:
                        # Keep only one condition
                        conditions.add(np.random.choice(candidate_conditions))
                elif has_multiple[i]:
                    # Force assign at least one condition if they should have multiple
                    # Choose based on probabilities
                    if condition_probs.sum() > 0:
                        normalized_probs = condition_probs / condition_probs.sum()
                        chosen_idx = np.random.choice(len(condition_names), p=normalized_probs)
                        conditions.add(condition_names[chosen_idx])

            # Store conditions
            person.comorbidity = conditions

    def _find_fallback_data_optimized(self, demographics: Tuple) -> Optional[Dict]:
        """Optimized fallback data lookup using pandas operations.

        Args:
            demographics (Tuple): (sex, age_band, ethnicity, region) tuple

        Returns:
            Dict or None: Fallback probability data

        """
        sex, age_band, ethnicity, region = demographics

        # Try different fallback strategies efficiently using pandas indexing
        fallback_strategies = [
            # Same demographics, different region  
            [(sex, age_band, ethnicity), 3],  # Don't filter region
            # Same sex/age, White ethnicity, same region
            [(sex, age_band, 'W', region), None],
            # Same sex/age, White ethnicity, any region
            [(sex, age_band, 'W'), 3],
            # Same age, any sex/ethnicity, same region
            [(None, age_band, None, region), [0, 2]],  # Don't filter sex and ethnicity
            # Same age, any sex/ethnicity/region
            [(None, age_band, None), [0, 2, 3]]  # Only filter age
        ]

        for strategy_tuple, skip_levels in fallback_strategies:
            try:
                if skip_levels is None:
                    # Exact match attempt
                    if strategy_tuple in self._comorbidity_data.index:
                        row = self._comorbidity_data.loc[strategy_tuple].iloc[0] if isinstance(
                            self._comorbidity_data.loc[strategy_tuple], pd.DataFrame) else self._comorbidity_data.loc[strategy_tuple]
                        return self._row_to_prob_data(row)
                else:
                    # Partial match - find any matching record
                    data_subset = self._comorbidity_data
                    for i, val in enumerate(strategy_tuple):
                        if val is not None and i not in (skip_levels if isinstance(skip_levels, list) else [skip_levels]):
                            level_name = data_subset.index.names[i]
                            data_subset = data_subset[data_subset.index.get_level_values(level_name) == val]

                    if not data_subset.empty:
                        row = data_subset.iloc[0]
                        logger.info(f"Using fallback data for {demographics}")
                        return self._row_to_prob_data(row)

            except (KeyError, IndexError):
                continue

        return None

    def _row_to_prob_data(self, row) -> Dict:
        """Convert DataFrame row to probability data format.

        Args:
            row (pandas.Series): DataFrame row

        Returns:
            Dict: Probability data dictionary

        """
        condition_probs = np.array([row[col] for col in self.condition_columns])
        has_any_prob = row.get('has_comorbidity_midpoint_rounded', 0.0)
        multiple_prob = row.get('multiple_morbidities_count_midpoint_rounded', 0.0)

        return {
            'condition_probs': condition_probs,
            'has_any_prob': has_any_prob,
            'multiple_prob': multiple_prob,
            'condition_names': list(self.condition_names.values())
        }

    def _get_default_probabilities_optimized(self, age_band: str) -> Dict:
        """Get optimized default comorbidity probabilities when no data is available.

        Args:
            age_band (str): Age band string

        Returns:
            Dict: Default probability data

        """
        # Extract numeric age from age band
        age = int(age_band.split('-')[0])

        # Age-based default probabilities
        if age < 18:
            base_prob = 0.01
        elif age < 30:
            base_prob = 0.05
        elif age < 50:
            base_prob = 0.15
        elif age < 75:
            base_prob = 0.30
        else:
            base_prob = 0.50

        condition_probs = np.full(len(self.condition_columns), base_prob * 0.1)

        return {
            'condition_probs': condition_probs,
            'has_any_prob': base_prob,
            'multiple_prob': base_prob * 0.1,
            'condition_names': list(self.condition_names.values())
        }

    def assign_comorbidities_to_all_residents(self, world) -> None:
        """Optimized assignment of comorbidities to all residents in the world using batch processing.

        Args:
            world (World): The world object containing all groups

        """
        logger.info("Starting comorbidity assignment for all residents...")

        # Load data if not already loaded
        if not self._data_loaded:
            self._load_comorbidity_data()

        total_people_processed = 0
        condition_counts = {name: 0 for name in self.condition_names.values()}
        multiple_conditions_count = 0

        # Collect all people for batch processing
        all_people = []

        # Process households
        if world.households is not None:
            logger.info("Collecting household residents...")
            for household in world.households:
                if household.residents:
                    all_people.extend(household.residents)

        # Process care homes
        if world.care_homes is not None:
            logger.info("Collecting care home residents...")
            for care_home in world.care_homes:
                if hasattr(care_home, 'residents') and care_home.residents:
                    all_people.extend(care_home.residents)

        # Process boarding schools
        if hasattr(world, 'boarding_schools') and world.boarding_schools is not None:
            logger.info("Collecting boarding school residents...")
            for boarding_school in world.boarding_schools:
                if hasattr(boarding_school, 'residents') and boarding_school.residents:
                    all_people.extend(boarding_school.residents)

        # Process student dorms
        if world.student_dorms is not None:
            logger.info("Collecting student dorm residents...")
            for student_dorm in world.student_dorms:
                if hasattr(student_dorm, 'residents') and student_dorm.residents:
                    all_people.extend(student_dorm.residents)

        logger.info(f"Collected {len(all_people):,} people for batch processing")

        # Group people by demographics for batch processing
        logger.info("Grouping people by demographics...")
        demographic_groups = self._group_people_by_demographics(all_people)
        logger.info(f"Created {len(demographic_groups):,} demographic groups")

        # Process each demographic group in batch
        for i, (demographics, people_group) in enumerate(demographic_groups.items()):
            if i % 100 == 0 and i > 0:
                logger.info(f"Processed {i:,}/{len(demographic_groups):,} demographic groups")

            self._assign_comorbidities_to_group(people_group, demographics)
            total_people_processed += len(people_group)

            # Update statistics
            for person in people_group:
                conditions = getattr(person, 'comorbidity', set())
                for condition in conditions:
                    if condition in condition_counts:
                        condition_counts[condition] += 1
                if len(conditions) > 1:
                    multiple_conditions_count += 1

        logger.info(f"Completed optimized comorbidity assignment: {total_people_processed:,} people processed")

        # Display statistics
        self._display_assignment_statistics(total_people_processed, condition_counts, multiple_conditions_count)

    def _display_assignment_statistics(self, total_people: int, condition_counts: Dict[str, int], multiple_count: int) -> None:
        """Display statistics about comorbidity assignments.

        Args:
            total_people (int): Total number of people processed
            condition_counts (Dict[str, int]): Count of each condition assigned
            multiple_count (int): Number of people with multiple conditions

        """
        print("\n===== Comorbidity Assignment Statistics =====")
        print(f"Total people processed: {total_people:,}")

        people_with_any_condition = len([count for count in condition_counts.values() if count > 0])
        total_conditions_assigned = sum(condition_counts.values())

        print(f"People with any comorbidity: {total_conditions_assigned - multiple_count:,} ({(total_conditions_assigned - multiple_count)/total_people*100:.1f}%)")
        print(f"People with multiple comorbidities: {multiple_count:,} ({multiple_count/total_people*100:.1f}%)")

        print("\nCondition prevalence:")
        for condition, count in sorted(condition_counts.items()):
            if count > 0:
                percentage = count / total_people * 100
                print(f"  {condition.replace('_', ' ').title()}: {count:,} ({percentage:.2f}%)")

        print("="*60)

`__init__()`

Initialize the ComorbidityDistributor with its data path and configuration.

Source code in june/distributors/comorbidity_distributor.py

def __init__(self):
    """Set up the data file location, empty lookup state and condition tables.

    Nothing is read from disk here: the prevalence CSV path is only built.
    """
    # Three directory levels above this module.
    root_dir = Path(__file__).parent.parent.parent
    self.script_dir = root_dir
    self.data_path = root_dir.joinpath(
        "data",
        "input",
        "demography",
        "comorbidities",
        "processed_counts_midpoint_rounded_proportions_n.csv",
    )

    # State that is filled in once the CSV has been loaded.
    self._comorbidity_data = None
    self._data_loaded = False
    self._demographics_cache = {}
    self._probability_arrays = {}

    # CSV condition column -> readable condition name.  Insertion order is
    # the order in which per-condition probabilities are stored.
    self.condition_names = {
        'has_had_cvd_diagnosis_count_midpoint_rounded': 'cardiovascular_disease',
        'has_had_crd_diagnosis_count_midpoint_rounded': 'chronic_respiratory_disease',
        'has_had_ckd_diagnosis_count_midpoint_rounded': 'chronic_kidney_disease',
        'has_had_cld_diagnosis_count_midpoint_rounded': 'chronic_liver_disease',
        'severe_obesity_count_midpoint_rounded': 'severe_obesity',
        'has_had_cancer_diagnosis_count_midpoint_rounded': 'cancer',
        'has_had_immunosuppression_diagnosis_count_midpoint_rounded': 'immunosuppression',
        'has_had_neuro_diagnosis_count_midpoint_rounded': 'neurological_condition',
    }

    # Condition columns of the CSV, in the same order as the mapping above.
    self.condition_columns = list(self.condition_names)

`assign_comorbidities_to_all_residents(world)`

Optimized assignment of comorbidities to all residents in the world using batch processing.

Parameters:

Name	Type	Description	Default
`world`	`World`	The world object containing all groups	required

Source code in june/distributors/comorbidity_distributor.py

def assign_comorbidities_to_all_residents(self, world) -> None:
    """Assign comorbidities to every resident found in the world.

    Residents are collected from ``world.households``, ``world.care_homes``,
    ``world.boarding_schools`` and ``world.student_dorms``; a collection that
    is missing or None is skipped, as is a venue without residents.  People
    are then grouped by demographic key, each group is processed by
    ``_assign_comorbidities_to_group`` and summary statistics are printed.
    When no resident is found a warning is logged and nothing is printed.

    Args:
        world (World): The world object containing all groups

    """
    logger.info("Starting comorbidity assignment for all residents...")

    # Load data if not already loaded
    if not self._data_loaded:
        self._load_comorbidity_data()

    # (world attribute, progress message) for every kind of residence.
    residence_sources = (
        ('households', "Collecting household residents..."),
        ('care_homes', "Collecting care home residents..."),
        ('boarding_schools', "Collecting boarding school residents..."),
        ('student_dorms', "Collecting student dorm residents..."),
    )

    all_people = []
    for attribute, message in residence_sources:
        venues = getattr(world, attribute, None)
        if venues is None:
            continue
        logger.info(message)
        for venue in venues:
            residents = getattr(venue, 'residents', None)
            if residents:
                all_people.extend(residents)

    logger.info(f"Collected {len(all_people):,} people for batch processing")

    if not all_people:
        # Avoids printing percentages of an empty population.
        logger.warning("No residents found; no comorbidities assigned")
        return

    # Group people by demographics for batch processing
    logger.info("Grouping people by demographics...")
    demographic_groups = self._group_people_by_demographics(all_people)
    n_groups = len(demographic_groups)
    logger.info(f"Created {n_groups:,} demographic groups")

    total_people_processed = 0
    people_with_any_count = 0
    multiple_conditions_count = 0
    condition_counts = {name: 0 for name in self.condition_names.values()}

    for i, (demographics, people_group) in enumerate(demographic_groups.items()):
        if i % 100 == 0 and i > 0:
            logger.info(f"Processed {i:,}/{n_groups:,} demographic groups")

        self._assign_comorbidities_to_group(people_group, demographics)
        total_people_processed += len(people_group)

        # Update statistics
        for person in people_group:
            conditions = getattr(person, 'comorbidity', set())
            for condition in conditions:
                if condition in condition_counts:
                    condition_counts[condition] += 1
            if conditions:
                people_with_any_count += 1
            if len(conditions) > 1:
                multiple_conditions_count += 1

    logger.info(f"Completed optimized comorbidity assignment: {total_people_processed:,} people processed")
    # Exact per-person tally; it cannot be derived from the per-condition counts.
    logger.info(f"People with at least one comorbidity: {people_with_any_count:,}")

    # Display statistics
    self._display_assignment_statistics(total_people_processed, condition_counts, multiple_conditions_count)

Comorbidity distributor