Ethnicity distributor

`EthnicityDistributor`

Distributor to assign ethnicities to people based on their household composition and area-specific demographic data. Integrates with JUNE's existing distributor pattern.

Source code in june/distributors/ethnicity_distributor.py

class EthnicityDistributor:
    """Assign ethnicity codes to people from household composition and area demographics.

    Household residents are handled per household, with members grouped into
    age bands before assignment. Residents of care homes, boarding schools and
    student dorms are sampled directly from the ethnicity probabilities of the
    venue's area. Area data is read from CSV files under
    ``ethnicity_assigner/data`` the first time households are processed, and
    per-area results are memoised in an in-memory cache.

    Integrates with JUNE's existing distributor pattern.
    """

    def __init__(self):
        """
        Initialize the EthnicityDistributor with configuration and data paths.
        """
        self.script_dir = Path(__file__).parent.parent.parent  # Go up to june-measles root
        self.ethnicity_dir = self.script_dir / "ethnicity_assigner"

        # Load ethnicity assignment configuration
        self._load_config()

        # Cache for loaded data to avoid repeated I/O
        self._data_cache = {}

        # Don't preload data at init - do it when we know which areas we need
        self._data_loaded = False

    def _load_config(self):
        """Load ethnicity assignment configuration"""
        import yaml

        config_path = self.ethnicity_dir / "data" / "config.yaml"
        if config_path.exists():
            with open(config_path, 'r') as f:
                config = yaml.safe_load(f)
        else:
            # Fallback default configuration
            config = {
                'age_thresholds': {
                    'child_max_age': 17,
                    'young_adult_min_age': 18,
                    'young_adult_max_age': 25,
                    'adult_min_age': 26,
                    'adult_max_age': 64,
                    'elderly_min_age': 65
                },
                'age_groups': {
                    'kids': 'kids',
                    'young_adults': 'young_adults',
                    'adults': 'adults',
                    'old_adults': 'old_adults'
                },
                'ethnicity_codes': {
                    'white': 'W',
                    'asian': 'A',
                    'black': 'B',
                    'mixed': 'M',
                    'other': 'O'
                }
            }

        # Store configuration as instance variables
        self.age_thresholds = config['age_thresholds']
        self.age_groups = config['age_groups']
        self.ethnicity_codes = config['ethnicity_codes']

    def _preload_targeted_data(self, area_codes_needed: set):
        """Preload only the ethnicity data needed for the specific areas in this world.

        Args:
            area_codes_needed (set): Set of area codes that exist in this world's households

        """
        logger.info(f"Preloading ethnicity data for {len(area_codes_needed)} areas...")

        # Initialize lookup dictionaries
        self._ethnicity_lookup_ew = {}
        self._ethnicity_lookup_scotland = {}
        self._ethnicity_lookup_ni = {}
        self._diversity_lookup_ew = {}
        self._diversity_lookup_scotland = {}
        self._diversity_lookup_ni = {}
        self._partnership_lookup = {}

        # Separate area codes by region
        ew_areas = {code for code in area_codes_needed if not (code.startswith('S') or code.startswith('N'))}
        scotland_areas = {code for code in area_codes_needed if code.startswith('S')}
        ni_areas = {code for code in area_codes_needed if code.startswith('N')}

        # Load England & Wales ethnicity data (only for needed areas)
        if ew_areas:
            ethnicity_path_ew = self.ethnicity_dir / "data" / "ethnicity_5groups_by_OA.csv"
            if ethnicity_path_ew.exists():
                df = pd.read_csv(ethnicity_path_ew)
                # Filter to only needed areas
                df_filtered = df[df['area'].isin(ew_areas)]
                for _, row in df_filtered.iterrows():
                    area_code = row['area']
                    total = row['total']
                    self._ethnicity_lookup_ew[area_code] = {
                        self.ethnicity_codes['asian']: row['A'] / total,
                        self.ethnicity_codes['black']: row['B'] / total,
                        self.ethnicity_codes['mixed']: row['M'] / total, 
                        self.ethnicity_codes['white']: row['W'] / total,
                        self.ethnicity_codes['other']: row['CO'] / total
                    }
                logger.info(f"Loaded England & Wales ethnicity data: {len(self._ethnicity_lookup_ew)} areas")

        # Load Scotland ethnicity data (only for needed areas)
        if scotland_areas:
            ethnicity_path_scotland = self.ethnicity_dir / "data" / "SCT_ethnicity_5groups_by_OA_standardized.csv"
            if ethnicity_path_scotland.exists():
                df = pd.read_csv(ethnicity_path_scotland)
                # Filter to only needed areas
                df_filtered = df[df['area'].isin(scotland_areas)]
                for _, row in df_filtered.iterrows():
                    area_code = row['area']
                    total = row['total']
                    self._ethnicity_lookup_scotland[area_code] = {
                        self.ethnicity_codes['asian']: row['A'] / total,
                        self.ethnicity_codes['black']: row['B'] / total,
                        self.ethnicity_codes['mixed']: row['M'] / total, 
                        self.ethnicity_codes['white']: row['W'] / total,
                        self.ethnicity_codes['other']: row['CO'] / total
                    }
                logger.info(f"Loaded Scotland ethnicity data: {len(self._ethnicity_lookup_scotland)} areas")

        # Load Northern Ireland ethnicity data (only for needed areas)
        if ni_areas:
            ethnicity_path_ni = self.ethnicity_dir / "data" / "NI_ethnicity_5groups_by_OA_standardized.csv"
            if ethnicity_path_ni.exists():
                df = pd.read_csv(ethnicity_path_ni)
                # Filter to only needed areas
                df_filtered = df[df['area'].isin(ni_areas)]
                for _, row in df_filtered.iterrows():
                    area_code = row['area']
                    total = row['total']
                    self._ethnicity_lookup_ni[area_code] = {
                        self.ethnicity_codes['asian']: row['A'] / total,
                        self.ethnicity_codes['black']: row['B'] / total,
                        self.ethnicity_codes['mixed']: row['M'] / total, 
                        self.ethnicity_codes['white']: row['W'] / total,
                        self.ethnicity_codes['other']: row['CO'] / total
                    }
                logger.info(f"Loaded Northern Ireland ethnicity data: {len(self._ethnicity_lookup_ni)} areas")

        # Load England & Wales diversity data (only for needed areas)
        if ew_areas:
            diversity_path_ew = self.ethnicity_dir / "data" / "combination_ethnicities_in_households_standardized.csv"
            if diversity_path_ew.exists():
                df = pd.read_csv(diversity_path_ew)
                # Filter to only needed areas
                df_filtered = df[df['area'].isin(ew_areas)]
                for _, row in df_filtered.iterrows():
                    area_code = row['area']
                    single_eth = float(row['single_ethnicity'])
                    two_eth = float(row['two_ethnicities'])
                    three_plus_eth = float(row['three_plus_ethnicities'])
                    total_households = single_eth + two_eth + three_plus_eth

                    self._diversity_lookup_ew[area_code] = {
                        'single_ethnicity': single_eth / total_households,
                        'two_ethnicities': two_eth / total_households,
                        'three_plus_ethnicities': three_plus_eth / total_households
                    }
                logger.info(f"Loaded England & Wales diversity data: {len(self._diversity_lookup_ew)} areas")

        # Load Scotland diversity data (only for needed areas)
        if scotland_areas:
            diversity_path_scotland = self.ethnicity_dir / "data" / "SCT_multiple_ethnic_groups_in_household_by_OA_standardized.csv"
            if diversity_path_scotland.exists():
                df = pd.read_csv(diversity_path_scotland)
                # Filter to only needed areas
                df_filtered = df[df['area'].isin(scotland_areas)]
                for _, row in df_filtered.iterrows():
                    area_code = row['area']
                    single_eth = float(row['single_ethnicity'])
                    two_eth = float(row['two_ethnicities'])
                    three_plus_eth = float(row['three_plus_ethnicities'])
                    total_households = single_eth + two_eth + three_plus_eth

                    self._diversity_lookup_scotland[area_code] = {
                        'single_ethnicity': single_eth / total_households,
                        'two_ethnicities': two_eth / total_households,
                        'three_plus_ethnicities': three_plus_eth / total_households
                    }
                logger.info(f"Loaded Scotland diversity data: {len(self._diversity_lookup_scotland)} areas")

        # Load Northern Ireland diversity data (only for needed areas)
        if ni_areas:
            diversity_path_ni = self.ethnicity_dir / "data" / "NI_multiple_ethnic_groups_in_household_by_OA_standardized.csv"
            if diversity_path_ni.exists():
                df = pd.read_csv(diversity_path_ni)
                # Filter to only needed areas
                df_filtered = df[df['area'].isin(ni_areas)]
                for _, row in df_filtered.iterrows():
                    area_code = row['area']
                    single_eth = float(row['single_ethnicity'])
                    two_eth = float(row['two_ethnicities'])
                    three_plus_eth = float(row['three_plus_ethnicities'])
                    total_households = single_eth + two_eth + three_plus_eth

                    self._diversity_lookup_ni[area_code] = {
                        'single_ethnicity': single_eth / total_households,
                        'two_ethnicities': two_eth / total_households,
                        'three_plus_ethnicities': three_plus_eth / total_households
                    }
                logger.info(f"Loaded Northern Ireland diversity data: {len(self._diversity_lookup_ni)} areas")

        # Load partnership data (only for needed areas)
        partnership_path = self.ethnicity_dir / "data" / "precomputed_area_partnerships.csv"
        if partnership_path.exists():
            df = pd.read_csv(partnership_path)
            # Filter to only needed areas
            df_filtered = df[df['area'].isin(area_codes_needed)]
            for _, row in df_filtered.iterrows():
                area_code = row['area']
                first_eth = row['first_ethnicity']

                if area_code not in self._partnership_lookup:
                    self._partnership_lookup[area_code] = {}

                self._partnership_lookup[area_code][first_eth] = {
                    'A': row['prob_A'],
                    'B': row['prob_B'], 
                    'M': row['prob_M'],
                    'W': row['prob_W'],
                    'O': row['prob_O']
                }
            logger.info(f"Loaded partnership data: {len(df_filtered)} partnerships for {len(self._partnership_lookup)} areas")

        logger.info(f"Ethnicity data loading completed: {len(area_codes_needed)} areas processed")

    def assign_ethnicities_to_households(self, households: Households) -> None:
        """Assign an ethnicity to every resident of every household.

        While ``self._data_loaded`` is false, the area codes of all households
        are collected first and the matching demographic data is preloaded.
        Every household that has residents is then passed to
        ``_assign_household_ethnicities``; the mapping it returns is keyed by
        ``str(person.id)`` and is applied to the matching residents. Progress
        is logged every 1000 households, and ``_display_sample_assignments``
        is called at the end.

        Args:
            households (Households): The Households supergroup containing all household objects

        """
        logger.info(f"Starting ethnicity assignment for {len(households)} households")

        # One-off pass to find out which areas this world actually contains.
        if not self._data_loaded:
            logger.info("Collecting area codes from households...")
            area_codes_needed = {
                code
                for code in map(self._get_area_code, households)
                if code
            }

            logger.info(f"Found {len(area_codes_needed)} unique areas, loading relevant data...")
            self._preload_targeted_data(area_codes_needed)
            self._data_loaded = True

        total_people_assigned = 0
        areas_processed = set()

        for i, household in enumerate(households):
            if i > 0 and i % 1000 == 0:
                logger.info(f"Processed {i:,} households, assigned ethnicities to {total_people_assigned:,} people")

            # Nothing to do for empty households.
            if not household.residents:
                continue

            household_area = self._get_area_code(household)
            areas_processed.add(household_area)

            ethnicity_by_id = self._assign_household_ethnicities(household, household_area)

            # Copy the result onto the Person objects that received one.
            for resident in household.residents:
                if not hasattr(resident, 'id'):
                    continue
                resident_key = str(resident.id)
                if resident_key in ethnicity_by_id:
                    resident.ethnicity = ethnicity_by_id[resident_key]
                    total_people_assigned += 1

        logger.info(f"Completed ethnicity assignment: {total_people_assigned:,} people assigned across {len(areas_processed)} unique areas")

        # Show sample of assignments
        self._display_sample_assignments(households, total_people_assigned)

    def assign_ethnicities_to_care_homes(self, care_homes) -> None:
        """Assign ethnicities to care home residents based on area demographics.

        Args:
            care_homes (CareHomes or similar supergroup): The care homes supergroup containing all care home objects

        """
        if not care_homes:
            return

        logger.info(f"Starting ethnicity assignment for care home residents...")

        total_people_assigned = 0

        for care_home in care_homes:
            if hasattr(care_home, 'residents') and care_home.residents:
                # Get area code for this care home
                area_code = self._get_area_code_from_venue(care_home)

                # Get area ethnicity probabilities
                if area_code:
                    area_probs = self._load_area_ethnicity_probabilities(area_code)
                else:
                    # Fallback probabilities for elderly
                    area_probs = {
                        self.ethnicity_codes['white']: 0.85,
                        self.ethnicity_codes['asian']: 0.08,
                        self.ethnicity_codes['black']: 0.04,
                        self.ethnicity_codes['mixed']: 0.02,
                        self.ethnicity_codes['other']: 0.01
                    }
                    # Normalize probabilities to ensure they sum to 1.0
                    area_probs = self._normalize_probabilities(area_probs)

                # Assign ethnicity to each resident
                for person in care_home.residents:
                    ethnicity = np.random.choice(
                        list(area_probs.keys()),
                        p=list(area_probs.values())
                    )
                    person.ethnicity = ethnicity
                    total_people_assigned += 1

        logger.info(f"Completed care home ethnicity assignment: {total_people_assigned:,} people assigned")

    def assign_ethnicities_to_boarding_schools(self, boarding_schools) -> None:
        """Assign ethnicities to boarding school residents based on area demographics.

        Args:
            boarding_schools (BoardingSchools or similar supergroup): The boarding schools supergroup containing all boarding school objects

        """
        if not boarding_schools:
            return

        logger.info(f"Starting ethnicity assignment for boarding school residents...")

        total_people_assigned = 0

        for boarding_school in boarding_schools:
            if hasattr(boarding_school, 'residents') and boarding_school.residents:
                # Get area code for this boarding school
                area_code = self._get_area_code_from_venue(boarding_school)

                # Get area ethnicity probabilities
                if area_code:
                    area_probs = self._load_area_ethnicity_probabilities(area_code)
                else:
                    # Fallback probabilities for young people
                    area_probs = {
                        self.ethnicity_codes['white']: 0.75,
                        self.ethnicity_codes['asian']: 0.12,
                        self.ethnicity_codes['black']: 0.07,
                        self.ethnicity_codes['mixed']: 0.04,
                        self.ethnicity_codes['other']: 0.02
                    }
                    # Normalize probabilities to ensure they sum to 1.0
                    area_probs = self._normalize_probabilities(area_probs)

                # Assign ethnicity to each resident
                for person in boarding_school.residents:
                    ethnicity = np.random.choice(
                        list(area_probs.keys()),
                        p=list(area_probs.values())
                    )
                    person.ethnicity = ethnicity
                    total_people_assigned += 1

        logger.info(f"Completed boarding school ethnicity assignment: {total_people_assigned:,} people assigned")

    def assign_ethnicities_to_student_dorms(self, student_dorms) -> None:
        """Assign ethnicities to student dorm residents based on area demographics.

        Args:
            student_dorms (StudentDorms or similar supergroup): The student dorms supergroup containing all student dorm objects

        """
        if not student_dorms:
            return

        logger.info(f"Starting ethnicity assignment for student dorm residents...")

        total_people_assigned = 0

        for student_dorm in student_dorms:
            if hasattr(student_dorm, 'residents') and student_dorm.residents:
                # Get area code for this student dorm
                area_code = self._get_area_code_from_venue(student_dorm)

                # Get area ethnicity probabilities
                if area_code:
                    area_probs = self._load_area_ethnicity_probabilities(area_code)
                else:
                    # Fallback probabilities for young adults
                    area_probs = {
                        self.ethnicity_codes['white']: 0.70,
                        self.ethnicity_codes['asian']: 0.15,
                        self.ethnicity_codes['black']: 0.08,
                        self.ethnicity_codes['mixed']: 0.05,
                        self.ethnicity_codes['other']: 0.02
                    }
                    # Normalize probabilities to ensure they sum to 1.0
                    area_probs = self._normalize_probabilities(area_probs)

                # Assign ethnicity to each resident
                for person in student_dorm.residents:
                    ethnicity = np.random.choice(
                        list(area_probs.keys()),
                        p=list(area_probs.values())
                    )
                    person.ethnicity = ethnicity
                    total_people_assigned += 1

        logger.info(f"Completed student dorm ethnicity assignment: {total_people_assigned:,} people assigned")

    def assign_ethnicities_to_all_residents(self, world) -> None:
        """Comprehensive method to assign ethnicities to all residents in the world.

        Args:
            world (World): The world object containing all groups

        """
        logger.info("Starting comprehensive ethnicity assignment for all residents...")

        # Households (relationship-aware assignment)
        if world.households is not None:
            self.assign_ethnicities_to_households(world.households)

        # Care homes (area-based assignment)
        if world.care_homes is not None:
            self.assign_ethnicities_to_care_homes(world.care_homes)

        # Boarding schools (area-based assignment)
        if hasattr(world, 'boarding_schools') and world.boarding_schools is not None:
            self.assign_ethnicities_to_boarding_schools(world.boarding_schools)

        # Student dorms (area-based assignment)
        if world.student_dorms is not None:
            self.assign_ethnicities_to_student_dorms(world.student_dorms)

        logger.info("Completed comprehensive ethnicity assignment for all residents")

    def _normalize_probabilities(self, probabilities: Dict[str, float]) -> Dict[str, float]:
        """Normalize probabilities to ensure they sum to 1.0.

        Args:
            probabilities (Dict[str, float]): Dictionary of probabilities

        Returns:
            Dict[str, float]: Normalized probabilities that sum to 1.0

        """
        total = sum(probabilities.values())
        if abs(total - 1.0) < 1e-10:  # Already normalized within floating point precision
            return probabilities
        elif total > 0:
            # Normalize to sum to 1.0
            return {key: value / total for key, value in probabilities.items()}
        else:
            # All probabilities are zero - return equal probabilities
            n = len(probabilities)
            return {key: 1.0 / n for key in probabilities.keys()}

    def _get_area_code_from_venue(self, venue) -> Optional[str]:
        """Extract area code from any venue (care home, boarding school, student dorm, etc.).

        Args:
            venue (object): The venue object (care home, boarding school, student dorm, etc.)

        Returns:
            str or None: The area code if available

        """
        # Try different possible attributes for area information
        if hasattr(venue, 'area') and venue.area:
            if hasattr(venue.area, 'name'):
                return venue.area.name
            elif hasattr(venue.area, 'id'):
                return str(venue.area.id)

        # Try direct area_code attribute
        if hasattr(venue, 'area_code'):
            return venue.area_code

        # Try coordinates-based lookup if available
        if hasattr(venue, 'coordinates') and venue.coordinates:
            # This would require reverse geocoding - for now return None
            pass

        return None

    def _get_area_code(self, household: Household) -> Optional[str]:
        """Extract area code from household for demographic data lookup.

        Args:
            household (Household): The household object

        Returns:
            str or None: The area code if available

        """
        if hasattr(household, 'area') and household.area:
            if hasattr(household.area, 'name'):
                return household.area.name
            elif hasattr(household.area, 'id'):
                return str(household.area.id)
        return None

    def _classify_person_by_age(self, person: Person) -> str:
        """Classify person into age group based on age thresholds.

        Args:
            person (Person): The person to classify

        Returns:
            str: Age group classification

        """
        age = person.age

        if age <= self.age_thresholds['child_max_age']:
            return self.age_groups['kids']
        elif self.age_thresholds['young_adult_min_age'] <= age <= self.age_thresholds['young_adult_max_age']:
            return self.age_groups['young_adults']
        elif self.age_thresholds['adult_min_age'] <= age <= self.age_thresholds['adult_max_age']:
            return self.age_groups['adults']
        else:
            return self.age_groups['old_adults']

    def _categorize_household_members(self, household: Household) -> Dict[str, List[Person]]:
        """Categorize household residents by age group.

        Args:
            household (Household): The household object

        Returns:
            Dict[str, List[Person]]: Dictionary mapping age groups to lists of people

        """
        categories = {
            self.age_groups['kids']: [],
            self.age_groups['young_adults']: [],
            self.age_groups['adults']: [],
            self.age_groups['old_adults']: []
        }

        for person in household.residents:
            age_group = self._classify_person_by_age(person)
            categories[age_group].append(person)

        return categories

    def _assign_household_ethnicities(self, household: Household, area_code: Optional[str]) -> Dict[str, str]:
        """Work out an ethnicity for every resident of one household.

        Residents are split by age group. The "adult" role is taken by the
        adults, or by the young adults when the household has young adults but
        no adults. Kids (plus the young adults, when they are not acting as the
        adults) are assigned from the adults' result, and the elderly are
        assigned from the adults' result and the area code. The three partial
        results are merged in that order.

        Args:
            household (Household): The household object
            area_code (Optional[str]): Area code for demographic data lookup

        Returns:
            Dict[str, str]: Mapping of person IDs to ethnicity codes

        """
        by_group = self._categorize_household_members(household)

        grown_ups = by_group[self.age_groups['adults']]
        young = by_group[self.age_groups['young_adults']]
        minors = by_group[self.age_groups['kids']]
        seniors = by_group[self.age_groups['old_adults']]

        if not grown_ups and young:
            # Young adults living independently: they head the household and
            # only actual children inherit.
            heads, dependants = young, minors
        else:
            # Normal household: kids and young adults both inherit from adults.
            heads, dependants = grown_ups, minors + young

        result = {}

        # Heads first, because the other two steps depend on their outcome.
        head_ethnicities = self._assign_adult_ethnicities(heads, area_code)
        result.update(head_ethnicities)

        result.update(self._assign_children_ethnicities(dependants, head_ethnicities))

        result.update(self._assign_elderly_ethnicities(seniors, head_ethnicities, area_code))

        return result

    def _load_area_ethnicity_probabilities(self, area_code: str) -> Dict[str, float]:
        """Load area-specific ethnicity probabilities with caching.
        Uses preloaded data for optimal performance.

        Args:
            area_code (str): The area code

        Returns:
            Dict[str, float]: Mapping of ethnicity codes to probabilities

        """
        cache_key = f"ethnicity_{area_code}"
        if cache_key in self._data_cache:
            return self._data_cache[cache_key]

        try:
            # Detect region: Scotland (S), Northern Ireland (N), or England & Wales (E)
            is_scotland = area_code.startswith('S')
            is_ni = area_code.startswith('N')

            if is_scotland:
                lookup_dict = self._ethnicity_lookup_scotland
                fallback_probs = {
                    self.ethnicity_codes['white']: 0.90,
                    self.ethnicity_codes['asian']: 0.05, 
                    self.ethnicity_codes['black']: 0.02,
                    self.ethnicity_codes['mixed']: 0.02,
                    self.ethnicity_codes['other']: 0.01
                }
            elif is_ni:
                lookup_dict = self._ethnicity_lookup_ni
                fallback_probs = {
                    self.ethnicity_codes['white']: 0.92,
                    self.ethnicity_codes['asian']: 0.04, 
                    self.ethnicity_codes['black']: 0.01,
                    self.ethnicity_codes['mixed']: 0.02,
                    self.ethnicity_codes['other']: 0.01
                }
            else:
                lookup_dict = self._ethnicity_lookup_ew
                fallback_probs = {
                    self.ethnicity_codes['white']: 0.81,
                    self.ethnicity_codes['asian']: 0.09, 
                    self.ethnicity_codes['black']: 0.04,
                    self.ethnicity_codes['mixed']: 0.03,
                    self.ethnicity_codes['other']: 0.03  # Fixed: was 0.02, now sums to 1.0
                }

            # Use O(1) dictionary lookup instead of DataFrame filtering
            probabilities = lookup_dict.get(area_code, fallback_probs)

            # Normalize probabilities to ensure they sum to 1.0
            probabilities = self._normalize_probabilities(probabilities)

            self._data_cache[cache_key] = probabilities
            return probabilities

        except Exception as e:
            logger.warning(f"Error loading ethnicity data for {area_code}: {e}. Using fallback probabilities")
            # Fallback probabilities
            fallback = {
                self.ethnicity_codes['white']: 0.81,
                self.ethnicity_codes['asian']: 0.09,
                self.ethnicity_codes['black']: 0.04,
                self.ethnicity_codes['mixed']: 0.03,
                self.ethnicity_codes['other']: 0.03  # Fixed: was 0.02, now sums to 1.0
            }
            # Normalize fallback probabilities to ensure they sum to 1.0
            fallback = self._normalize_probabilities(fallback)
            self._data_cache[cache_key] = fallback
            return fallback

    def _load_area_diversity_data(self, area_code: str) -> Dict[str, float]:
        """Load area-specific household diversity data with caching.
        Uses preloaded data for optimal performance.

        Args:
            area_code (str): The area code

        Returns:
            Dict[str, float]: Mapping of diversity categories to probabilities

        """
        cache_key = f"diversity_{area_code}"
        if cache_key in self._data_cache:
            return self._data_cache[cache_key]

        try:
            # Detect region: Scotland (S), Northern Ireland (N), or England & Wales (E)
            is_scotland = area_code.startswith('S')
            is_ni = area_code.startswith('N')

            if is_scotland:
                lookup_dict = self._diversity_lookup_scotland
                fallback_diversity = {
                    'single_ethnicity': 0.88,
                    'two_ethnicities': 0.10,  
                    'three_plus_ethnicities': 0.02
                }
            elif is_ni:
                lookup_dict = self._diversity_lookup_ni
                fallback_diversity = {
                    'single_ethnicity': 0.90,
                    'two_ethnicities': 0.08,  
                    'three_plus_ethnicities': 0.02
                }
            else:
                lookup_dict = self._diversity_lookup_ew
                fallback_diversity = {
                    'single_ethnicity': 0.85,
                    'two_ethnicities': 0.12,  
                    'three_plus_ethnicities': 0.03
                }

            # Use O(1) dictionary lookup instead of DataFrame filtering
            diversity = lookup_dict.get(area_code, fallback_diversity)

            # Normalize diversity probabilities to ensure they sum to 1.0
            diversity = self._normalize_probabilities(diversity)

            self._data_cache[cache_key] = diversity
            return diversity

        except Exception as e:
            logger.warning(f"Error loading diversity data for {area_code}: {e}. Using fallback diversity")
            fallback = {
                'single_ethnicity': 0.85,
                'two_ethnicities': 0.12,
                'three_plus_ethnicities': 0.03
            }
            # Normalize fallback diversity probabilities to ensure they sum to 1.0
            fallback = self._normalize_probabilities(fallback)
            self._data_cache[cache_key] = fallback
            return fallback

    def _load_partnership_probabilities(self, area_code: str) -> Dict[str, Dict[str, float]]:
        """Load area-specific partnership probabilities with caching.
        Uses preloaded data for optimal performance.

        Args:
            area_code (str): The area code

        Returns:
            Dict[str, Dict[str, float]]: Nested mapping of first ethnicity to partner ethnicity probabilities

        """
        cache_key = f"partnerships_{area_code}"
        if cache_key in self._data_cache:
            return self._data_cache[cache_key]

        try:            
            # Use O(1) dictionary lookup instead of DataFrame filtering
            partnerships = self._partnership_lookup.get(area_code, {})

            if partnerships:
                # Normalize partnership probabilities for each first ethnicity
                normalized_partnerships = {}
                for first_eth, partner_probs in partnerships.items():
                    normalized_partnerships[first_eth] = self._normalize_probabilities(partner_probs)
                self._data_cache[cache_key] = normalized_partnerships
                return normalized_partnerships

            # Fallback if no data available
            ethnicity_codes = list(self.ethnicity_codes.values())
            equal_prob = 1.0 / len(ethnicity_codes)
            fallback = {}
            for eth1 in ethnicity_codes:
                fallback[eth1] = {eth2: equal_prob for eth2 in ethnicity_codes}

            self._data_cache[cache_key] = fallback
            return fallback

        except Exception as e:
            logger.warning(f"Error loading partnership data for {area_code}: {e}. Using fallback partnerships")
            # Fallback - equal probability for all ethnicities
            ethnicity_codes = list(self.ethnicity_codes.values())
            equal_prob = 1.0 / len(ethnicity_codes)
            fallback = {}
            for eth1 in ethnicity_codes:
                fallback[eth1] = {eth2: equal_prob for eth2 in ethnicity_codes}
            self._data_cache[cache_key] = fallback
            return fallback

    def _assign_adult_ethnicities(self, adults: List[Person], area_code: Optional[str]) -> Dict[str, str]:
        """Assign ethnicities to adults in the household.

        Args:
            adults (List[Person]): List of adult persons
            area_code (Optional[str]): Area code for demographic lookup

        Returns:
            Dict[str, str]: Mapping of person IDs to ethnicity codes

        """
        if not adults:
            return {}

        assignments = {}

        # Get area-specific probabilities
        if area_code:
            adult_probs = self._load_area_ethnicity_probabilities(area_code)
        else:
            # Use default UK probabilities
            adult_probs = {
                self.ethnicity_codes['white']: 0.81,
                self.ethnicity_codes['asian']: 0.09,
                self.ethnicity_codes['black']: 0.04,
                self.ethnicity_codes['mixed']: 0.03,
                self.ethnicity_codes['other']: 0.03  # Fixed: was 0.02, now sums to 1.0
            }
            # Normalize probabilities to ensure they sum to 1.0
            adult_probs = self._normalize_probabilities(adult_probs)

        # Assign first adult based on area probabilities
        first_adult = adults[0]
        first_ethnicity = np.random.choice(
            list(adult_probs.keys()),
            p=list(adult_probs.values())
        )
        assignments[str(first_adult.id)] = first_ethnicity

        # For multiple adults, use area diversity to decide mixed vs single ethnicity
        if len(adults) > 1 and area_code:
            diversity_data = self._load_area_diversity_data(area_code)

            rand = np.random.random()
            if rand < diversity_data['single_ethnicity']:
                # Single ethnicity household
                for adult in adults[1:]:
                    assignments[str(adult.id)] = first_ethnicity
            else:
                # Mixed ethnicity household - use partnership probabilities
                partnerships = self._load_partnership_probabilities(area_code)
                partner_probs = partnerships.get(first_ethnicity, {})

                for adult in adults[1:]:
                    if partner_probs:
                        partner_options = list(partner_probs.keys())
                        partner_probabilities = list(partner_probs.values())
                        ethnicity = np.random.choice(partner_options, p=partner_probabilities)
                    else:
                        # Fallback to area probabilities
                        ethnicity = np.random.choice(
                            list(adult_probs.keys()),
                            p=list(adult_probs.values())
                        )
                    assignments[str(adult.id)] = ethnicity
        else:
            # Single adult or no area code - use area probabilities for remaining adults
            for adult in adults[1:]:
                ethnicity = np.random.choice(
                    list(adult_probs.keys()),
                    p=list(adult_probs.values())
                )
                assignments[str(adult.id)] = ethnicity

        return assignments

    def _assign_children_ethnicities(self, children: List[Person], adult_assignments: Dict[str, str]) -> Dict[str, str]:
        """Assign ethnicities to children based on adult ethnicities.

        Args:
            children (List[Person]): List of children/young adults
            adult_assignments (Dict[str, str]): Adult ethnicity assignments

        Returns:
            Dict[str, str]: Mapping of person IDs to ethnicity codes

        """
        if not children:
            return {}

        assignments = {}
        parent_ethnicities = list(adult_assignments.values())

        # Determine child ethnicity based on parent ethnicities
        if len(parent_ethnicities) == 0:
            # No adults - shouldn't happen, but handle gracefully
            child_ethnicity = self.ethnicity_codes['white']  # Default fallback
        elif len(parent_ethnicities) == 1:
            child_ethnicity = parent_ethnicities[0]
        elif len(set(parent_ethnicities)) == 1:
            # All parents same ethnicity
            child_ethnicity = parent_ethnicities[0]
        else:
            # Mixed ethnicity parents
            child_ethnicity = self.ethnicity_codes['mixed']

        # Assign same ethnicity to all children
        for child in children:
            assignments[str(child.id)] = child_ethnicity

        return assignments

    def _assign_elderly_ethnicities(self, elderly: List[Person], adult_assignments: Dict[str, str], area_code: Optional[str]) -> Dict[str, str]:
        """Assign ethnicities to elderly based on relationship to adults.

        Args:
            elderly (List[Person]): List of elderly persons
            adult_assignments (Dict[str, str]): Adult ethnicity assignments
            area_code (Optional[str]): Area code for demographic lookup

        Returns:
            Dict[str, str]: Mapping of person IDs to ethnicity codes

        """
        if not elderly:
            return {}

        assignments = {}
        adult_ethnicities = list(adult_assignments.values())

        # Handle elderly-only households (no adults present)
        if not adult_ethnicities:
            if area_code:
                elderly_probs = self._load_area_ethnicity_probabilities(area_code)
            else:
                elderly_probs = {
                    self.ethnicity_codes['white']: 0.85,
                    self.ethnicity_codes['asian']: 0.08,
                    self.ethnicity_codes['black']: 0.04,
                    self.ethnicity_codes['mixed']: 0.02,
                    self.ethnicity_codes['other']: 0.01
                }
                # Normalize probabilities to ensure they sum to 1.0
                elderly_probs = self._normalize_probabilities(elderly_probs)

            if len(elderly) == 2:
                # Elderly couple - use same logic as adult couples
                elder1, elder2 = elderly[0], elderly[1]

                # Assign first elderly person
                first_ethnicity = np.random.choice(
                    list(elderly_probs.keys()),
                    p=list(elderly_probs.values())
                )
                assignments[str(elder1.id)] = first_ethnicity

                # Decide if couple should be mixed using area diversity
                if area_code:
                    diversity_data = self._load_area_diversity_data(area_code)
                    if np.random.random() < diversity_data['single_ethnicity']:
                        assignments[str(elder2.id)] = first_ethnicity
                    else:
                        partnerships = self._load_partnership_probabilities(area_code)
                        partner_probs = partnerships.get(first_ethnicity, {})
                        if partner_probs:
                            partner_options = list(partner_probs.keys())
                            partner_probabilities = list(partner_probs.values())
                            partner_ethnicity = np.random.choice(partner_options, p=partner_probabilities)
                        else:
                            partner_ethnicity = np.random.choice(
                                list(elderly_probs.keys()),
                                p=list(elderly_probs.values())
                            )
                        assignments[str(elder2.id)] = partner_ethnicity
                else:
                    # No area data - assign second elderly independently
                    second_ethnicity = np.random.choice(
                        list(elderly_probs.keys()),
                        p=list(elderly_probs.values())
                    )
                    assignments[str(elder2.id)] = second_ethnicity
            else:
                # Single elderly or 3+ elderly - assign independently
                for elderly_person in elderly:
                    ethnicity = np.random.choice(
                        list(elderly_probs.keys()),
                        p=list(elderly_probs.values())
                    )
                    assignments[str(elderly_person.id)] = ethnicity

        elif len(elderly) == 1:
            # Single elderly = parent of one of the adults
            elderly_person = elderly[0]
            parent_ethnicity = np.random.choice(adult_ethnicities)
            assignments[str(elderly_person.id)] = parent_ethnicity

        elif len(elderly) == 2:
            # Elderly couple = parents/grandparents
            elder1, elder2 = elderly[0], elderly[1]

            # Check if any adults are Mixed (indicates inter-ethnic parents)
            if self.ethnicity_codes['mixed'] in adult_ethnicities:
                # Mixed adult means parents had different ethnicities
                possible_combinations = [
                    (self.ethnicity_codes['white'], self.ethnicity_codes['asian']),
                    (self.ethnicity_codes['white'], self.ethnicity_codes['black']),
                    (self.ethnicity_codes['asian'], self.ethnicity_codes['black']),
                    (self.ethnicity_codes['white'], self.ethnicity_codes['mixed']),
                    (self.ethnicity_codes['asian'], self.ethnicity_codes['mixed']),
                    (self.ethnicity_codes['black'], self.ethnicity_codes['mixed'])
                ]
                parent_combo = possible_combinations[np.random.randint(len(possible_combinations))]
                assignments[str(elder1.id)] = parent_combo[0]
                assignments[str(elder2.id)] = parent_combo[1]
            else:
                # No Mixed adults
                unique_adult_ethnicities = list(set(adult_ethnicities))

                if len(unique_adult_ethnicities) == 1:
                    # All adults same ethnicity
                    ethnicity = unique_adult_ethnicities[0]
                    assignments[str(elder1.id)] = ethnicity
                    assignments[str(elder2.id)] = ethnicity
                else:
                    # Multiple adult ethnicities
                    assignments[str(elder1.id)] = unique_adult_ethnicities[0]
                    assignments[str(elder2.id)] = unique_adult_ethnicities[1] if len(unique_adult_ethnicities) > 1 else unique_adult_ethnicities[0]

        else:
            # More than 2 elderly - assign based on adult ethnicities
            for i, elderly_person in enumerate(elderly):
                if i < len(adult_ethnicities):
                    assignments[str(elderly_person.id)] = adult_ethnicities[i]
                else:
                    assignments[str(elderly_person.id)] = np.random.choice(adult_ethnicities)

        return assignments

    def _display_sample_assignments(self, households: Households, total_assigned: int) -> None:
        """Display a sample of ethnicity assignments similar to other JUNE distributors.

        Args:
            households (Households): The households with assigned ethnicities
            total_assigned (int): Total number of people assigned ethnicities

        """
        import random

        print("\n===== Sample of Ethnicity Assignments =====")

        # Collect sample households with residents
        sample_households = []
        for household in households:
            if household.residents and len(household.residents) > 0:
                sample_households.append(household)

        if not sample_households:
            print("No households with residents found for sampling")
            return

        # Show sample of up to 10 households
        sample_size = min(10, len(sample_households))
        sample_households = random.sample(sample_households, sample_size)

        for i, household in enumerate(sample_households, 1):
            area_code = self._get_area_code(household)
            print(f"\nHousehold {i} (Area: {area_code or 'Unknown'}):")

            # Show household composition and ethnicities
            for person in household.residents:
                age_group = self._classify_person_by_age(person)
                ethnicity = getattr(person, 'ethnicity', 'Unassigned')
                print(f"  Person {person.id}: Age {person.age:2d} ({person.sex}) | {age_group:12s} | Ethnicity: {ethnicity}")

            # Show household diversity
            ethnicities = [getattr(person, 'ethnicity', 'Unassigned') for person in household.residents]
            unique_ethnicities = set(ethnicities)
            diversity_type = "Single ethnicity" if len(unique_ethnicities) == 1 else f"Mixed ({len(unique_ethnicities)} ethnicities)"
            print(f"  → Household type: {diversity_type}")

        # Show overall statistics
        print(f"\n===== Ethnicity Assignment Statistics =====")

        # Count ethnicities across all people
        ethnicity_counts = {}
        household_types = {"Single ethnicity": 0, "Mixed": 0}

        for household in households:
            if household.residents:
                # Count individual ethnicities
                for person in household.residents:
                    ethnicity = getattr(person, 'ethnicity', 'Unassigned')
                    ethnicity_counts[ethnicity] = ethnicity_counts.get(ethnicity, 0) + 1

                # Count household types
                household_ethnicities = set(getattr(person, 'ethnicity', 'Unassigned') for person in household.residents)
                if len(household_ethnicities) == 1:
                    household_types["Single ethnicity"] += 1
                else:
                    household_types["Mixed"] += 1

        print(f"Total people assigned ethnicities: {total_assigned:,}")
        print(f"Ethnicity distribution:")

        # Show ethnicity counts
        ethnicity_names = {
            'W': 'White',
            'A': 'Asian/Asian British', 
            'B': 'Black/African/Caribbean/Black British',
            'M': 'Mixed/Multiple ethnic groups',
            'O': 'Other ethnic group'
        }

        for code, count in sorted(ethnicity_counts.items()):
            name = ethnicity_names.get(code, code)
            percentage = (count / total_assigned * 100) if total_assigned > 0 else 0
            print(f"  {name}: {count:,} ({percentage:.1f}%)")

        # Show household diversity statistics
        total_households = sum(household_types.values())
        print(f"\nHousehold diversity:")
        for htype, count in household_types.items():
            percentage = (count / total_households * 100) if total_households > 0 else 0
            print(f"  {htype}: {count:,} ({percentage:.1f}%)")

        print("="*50)

`__init__()`

Initialize the EthnicityDistributor with configuration and data paths.

Source code in june/distributors/ethnicity_distributor.py

def __init__(self):
    """
    Set up data locations, load the assignment configuration and create an empty cache.
    """
    # Three directory levels above this module (the june-measles root)
    project_root = Path(__file__).parent.parent.parent
    self.script_dir = project_root
    self.ethnicity_dir = project_root / "ethnicity_assigner"

    # Configuration is read before the cache attributes are created
    self._load_config()

    # Per-instance cache so repeated lookups avoid repeated I/O
    self._data_cache = {}

    # Data is loaded lazily, once the set of required areas is known
    self._data_loaded = False

`assign_ethnicities_to_all_residents(world)`

Comprehensive method to assign ethnicities to all residents in the world.

Parameters:

Name	Type	Description	Default
`world`	`World`	The world object containing all groups	required

Source code in june/distributors/ethnicity_distributor.py

def assign_ethnicities_to_all_residents(self, world) -> None:
    """Assign ethnicities to the residents of every residence type present in the world.

    Households, care homes, boarding schools and student dorms are processed in that
    order; any of them that is ``None`` is skipped. ``boarding_schools`` may be absent
    from the world object altogether, in which case it is skipped as well.

    Args:
        world (World): The world object containing all groups

    """
    logger.info("Starting comprehensive ethnicity assignment for all residents...")

    # Households (relationship-aware assignment)
    households = world.households
    if households is not None:
        self.assign_ethnicities_to_households(households)

    # Care homes (area-based assignment)
    care_homes = world.care_homes
    if care_homes is not None:
        self.assign_ethnicities_to_care_homes(care_homes)

    # Boarding schools (area-based assignment); the attribute is optional
    boarding_schools = getattr(world, 'boarding_schools', None)
    if boarding_schools is not None:
        self.assign_ethnicities_to_boarding_schools(boarding_schools)

    # Student dorms (area-based assignment)
    student_dorms = world.student_dorms
    if student_dorms is not None:
        self.assign_ethnicities_to_student_dorms(student_dorms)

    logger.info("Completed comprehensive ethnicity assignment for all residents")

`assign_ethnicities_to_boarding_schools(boarding_schools)`

Assign ethnicities to boarding school residents based on area demographics.

Parameters:

Name	Type	Description	Default
`boarding_schools`	`BoardingSchools or similar supergroup`	The boarding schools supergroup containing all boarding school objects	required

Source code in june/distributors/ethnicity_distributor.py

def assign_ethnicities_to_boarding_schools(self, boarding_schools) -> None:
    """Sample an ethnicity for every boarding school resident from area demographics.

    Each resident is drawn independently from the distribution of the school's area;
    schools without a resolvable area code use a built-in fallback distribution. The
    result is written to ``person.ethnicity``. Empty input returns immediately.

    Args:
        boarding_schools (BoardingSchools or similar supergroup): The boarding schools supergroup containing all boarding school objects

    """
    if not boarding_schools:
        return

    logger.info("Starting ethnicity assignment for boarding school residents...")

    total_people_assigned = 0

    for boarding_school in boarding_schools:
        residents = getattr(boarding_school, 'residents', None)
        if not residents:
            # Venue has no residents attribute or nobody lives there
            continue

        area_code = self._get_area_code_from_venue(boarding_school)
        if area_code:
            area_probs = self._load_area_ethnicity_probabilities(area_code)
        else:
            # Fallback probabilities for young people, normalized to sum to 1.0
            codes = self.ethnicity_codes
            area_probs = self._normalize_probabilities({
                codes['white']: 0.75,
                codes['asian']: 0.12,
                codes['black']: 0.07,
                codes['mixed']: 0.04,
                codes['other']: 0.02,
            })

        # The distribution is fixed per venue, so build the sampling inputs once
        options = list(area_probs.keys())
        weights = list(area_probs.values())
        for person in residents:
            person.ethnicity = np.random.choice(options, p=weights)
            total_people_assigned += 1

    logger.info(f"Completed boarding school ethnicity assignment: {total_people_assigned:,} people assigned")

`assign_ethnicities_to_care_homes(care_homes)`

Assign ethnicities to care home residents based on area demographics.

Parameters:

Name	Type	Description	Default
`care_homes`	`CareHomes or similar supergroup`	The care homes supergroup containing all care home objects	required

Source code in june/distributors/ethnicity_distributor.py

def assign_ethnicities_to_care_homes(self, care_homes) -> None:
    """Sample an ethnicity for every care home resident from area demographics.

    Each resident is drawn independently from the distribution of the care home's
    area; homes without a resolvable area code use a built-in fallback distribution.
    The result is written to ``person.ethnicity``. Empty input returns immediately.

    Args:
        care_homes (CareHomes or similar supergroup): The care homes supergroup containing all care home objects

    """
    if not care_homes:
        return

    logger.info("Starting ethnicity assignment for care home residents...")

    total_people_assigned = 0

    for care_home in care_homes:
        residents = getattr(care_home, 'residents', None)
        if not residents:
            # Venue has no residents attribute or nobody lives there
            continue

        area_code = self._get_area_code_from_venue(care_home)
        if area_code:
            area_probs = self._load_area_ethnicity_probabilities(area_code)
        else:
            # Fallback probabilities for elderly, normalized to sum to 1.0
            codes = self.ethnicity_codes
            area_probs = self._normalize_probabilities({
                codes['white']: 0.85,
                codes['asian']: 0.08,
                codes['black']: 0.04,
                codes['mixed']: 0.02,
                codes['other']: 0.01,
            })

        # The distribution is fixed per venue, so build the sampling inputs once
        options = list(area_probs.keys())
        weights = list(area_probs.values())
        for person in residents:
            person.ethnicity = np.random.choice(options, p=weights)
            total_people_assigned += 1

    logger.info(f"Completed care home ethnicity assignment: {total_people_assigned:,} people assigned")

`assign_ethnicities_to_households(households)`

Main method to assign ethnicities to all residents in households.

Parameters:

Name	Type	Description	Default
`households`	`Households`	The Households supergroup containing all household objects	required

Source code in june/distributors/ethnicity_distributor.py

def assign_ethnicities_to_households(self, households: Households) -> None:
    """Assign an ethnicity to every resident of every household.

    On the first call the area codes of all households are collected and handed to
    ``self._preload_targeted_data``. Each household with residents is then passed to
    ``self._assign_household_ethnicities`` and the returned mapping is written onto
    the matching ``Person`` objects. A sample of the result is printed at the end.

    Args:
        households (Households): The Households supergroup containing all household objects

    """
    logger.info(f"Starting ethnicity assignment for {len(households)} households")

    # First pass: collect all unique area codes that exist in this world
    if not self._data_loaded:
        logger.info("Collecting area codes from households...")
        area_codes_needed = {
            code
            for code in (self._get_area_code(household) for household in households)
            if code
        }

        logger.info(f"Found {len(area_codes_needed)} unique areas, loading relevant data...")
        self._preload_targeted_data(area_codes_needed)
        self._data_loaded = True

    total_people_assigned = 0
    areas_processed = set()

    for i, household in enumerate(households):
        # Progress message every 1000 households (not for the very first one)
        if i > 0 and i % 1000 == 0:
            logger.info(f"Processed {i:,} households, assigned ethnicities to {total_people_assigned:,} people")

        if not household.residents:
            continue

        area_code = self._get_area_code(household)
        areas_processed.add(area_code)

        # Mapping of str(person.id) -> ethnicity for this household
        assignments = self._assign_household_ethnicities(household, area_code)

        for person in household.residents:
            if not hasattr(person, 'id'):
                continue
            person_key = str(person.id)
            if person_key in assignments:
                person.ethnicity = assignments[person_key]
                total_people_assigned += 1

    logger.info(f"Completed ethnicity assignment: {total_people_assigned:,} people assigned across {len(areas_processed)} unique areas")

    # Show sample of assignments
    self._display_sample_assignments(households, total_people_assigned)

`assign_ethnicities_to_student_dorms(student_dorms)`

Assign ethnicities to student dorm residents based on area demographics.

Parameters:

Name	Type	Description	Default
`student_dorms`	`StudentDorms or similar supergroup`	The student dorms supergroup containing all student dorm objects	required

Source code in june/distributors/ethnicity_distributor.py

def assign_ethnicities_to_student_dorms(self, student_dorms) -> None:
    """Sample an ethnicity for every student dorm resident from area demographics.

    Each resident is drawn independently from the distribution of the dorm's area;
    dorms without a resolvable area code use a built-in fallback distribution. The
    result is written to ``person.ethnicity``. Empty input returns immediately.

    Args:
        student_dorms (StudentDorms or similar supergroup): The student dorms supergroup containing all student dorm objects

    """
    if not student_dorms:
        return

    logger.info("Starting ethnicity assignment for student dorm residents...")

    total_people_assigned = 0

    for student_dorm in student_dorms:
        residents = getattr(student_dorm, 'residents', None)
        if not residents:
            # Venue has no residents attribute or nobody lives there
            continue

        area_code = self._get_area_code_from_venue(student_dorm)
        if area_code:
            area_probs = self._load_area_ethnicity_probabilities(area_code)
        else:
            # Fallback probabilities for young adults, normalized to sum to 1.0
            codes = self.ethnicity_codes
            area_probs = self._normalize_probabilities({
                codes['white']: 0.70,
                codes['asian']: 0.15,
                codes['black']: 0.08,
                codes['mixed']: 0.05,
                codes['other']: 0.02,
            })

        # The distribution is fixed per venue, so build the sampling inputs once
        options = list(area_probs.keys())
        weights = list(area_probs.values())
        for person in residents:
            person.ethnicity = np.random.choice(options, p=weights)
            total_people_assigned += 1

    logger.info(f"Completed student dorm ethnicity assignment: {total_people_assigned:,} people assigned")

Ethnicity distributor