Worker distributor new

`WorkerDistributorNew`

New Worker Distributor that uses LAD-based likelihood data to assign work locations.

This distributor:

1. Maps a person's area to its LAD using geography data.
2. Uses likelihood data to determine which destination LAD they work in.
3. Maps the destination LAD to a specific super area (MSOA).
4. Assigns work sector and lockdown status using existing logic.

Source code in june/distributors/worker_distributor_new.py

class WorkerDistributorNew:
    """New Worker Distributor that uses LAD-based likelihood data to assign work locations.

    This distributor:
    1. Maps person's area to their LAD using geography data
    2. Uses likelihood data to determine which destination LAD they work in
    3. Maps destination LAD to a specific super area (MSOA)
    4. Assigns work sector and lockdown status using existing logic

    """

    def __init__(
        self,
        likelihood_df: pd.DataFrame,
        geography_df: pd.DataFrame,
        workers_df: pd.DataFrame,
        sex_industry_df: pd.DataFrame,
        company_closure: dict,
        age_range: List[int],
        sub_sector_ratio: dict,
        sub_sector_distr: dict,
        non_geographical_work_location: dict,
    ):
        """Store the input tables and build the lookup structures derived from them.

        Args:
          likelihood_df (pd.DataFrame):
            DataFrame with origin LAD, destination LAD, and likelihood of working there.
            Read through the columns 'Origin area name', 'Destination area name',
            'Place of work indicator' and 'Likelihood'.
          geography_df (pd.DataFrame):
            DataFrame mapping area -> msoa -> lad -> region
            (columns 'area', 'msoa' and 'lad' are read in this class).
          workers_df (pd.DataFrame):
            DataFrame with industry-specific employment data by output area and MSOA.
            Rows whose 'output_area' is 'ALL' are treated as MSOA totals.
          sex_industry_df (pd.DataFrame):
            DataFrame grouped by 'LAD' and 'Sex' holding the same industry
            columns as ``workers_df``.
          company_closure (dict):
            Lockdown status probabilities by sector
          age_range (List[int]):
            Min and max age for workers (both bounds inclusive)
          sub_sector_ratio (dict):
            Key sector ratios by sex
          sub_sector_distr (dict):
            Key sector distributions by sex
          non_geographical_work_location (dict):
            Special work locations (home, offshore, etc.), keyed by the
            destination name found in the likelihood data.
        """
        self.likelihood_df = likelihood_df
        self.geography_df = geography_df
        self.workers_df = workers_df
        self.sex_industry_df = sex_industry_df
        self.age_range = age_range
        self.sub_sector_ratio = sub_sector_ratio
        self.sub_sector_distr = sub_sector_distr
        self.non_geographical_work_location = non_geographical_work_location
        self.company_closure = company_closure
        self._boundary_workers_counter = count()
        self.n_boundary_workers = 0

        # Simplified statistics tracking (counters are updated while workers are assigned)
        self.stats = {
            'total_workers': 0,
            'assigned_by_lad_likelihood': 0,
            'assigned_home': 0,
            'assigned_out_of_scope': 0,
            'assigned_first_try_msoa': 0,
            'assigned_bounced_back_msoa': 0,
            'assigned_cross_border_ew_to_scotland': 0,
            'assigned_with_industry_data': 0,
            'industry_assignments': {},
            'work_mode_assignments': {},  # Track work mode assignments (Normal/Hybrid/From_Home)
            'msoa_sector_allocations': {},  # Track allocated workers by MSOA and sector
            'msoa_sector_sex_allocations': {},  # Track allocated workers by MSOA, sector, and sex
            'sub_sector_assignments': {},  # Track sub-sector assignments
            'sub_sector_samples': []  # Sample of people with sub-sectors for display
        }

        # Create lookup dictionaries for efficient mapping.
        # Order matters: _process_sex_industry_data() reads self.industry_columns,
        # which _process_workers_data() sets.
        self._create_geography_lookups()
        self._create_likelihood_lookups()
        self._process_workers_data()
        self._process_sex_industry_data()

        # Per-run structures; populated at the start of distribute()
        self._capacity_lookup = None
        self._world_msoas_cache = None

        # Initialize performance caches.
        # _scottish_lad_cache must exist before _pre_build_lad_regional_cache() fills it.
        self._scottish_lad_cache = {}
        self._geography_lads_with_scottish_areas = None
        self._pre_build_lad_regional_cache()

        # Worker distribution optimization caches
        self._lad_msoa_probabilities_cache = {}  # Pre-computed MSOA selection probabilities by LAD
        self._capacity_arrays = {}  # Numpy arrays for fast capacity tracking

    def _pre_build_lad_regional_cache(self):
        """Collect the Scottish and Northern Ireland LAD names and seed the lookup cache.

        LAD names come from two regional geography CSV files under
        ``paths.data_path`` (each optional; a missing or unreadable file only
        produces a warning) plus any LAD in ``self.geography_df`` whose area
        code starts with 'S'. The results are stored in ``self._scottish_lads``
        and ``self._northern_ireland_lads``, and ``self._scottish_lad_cache``
        is pre-filled with True for Scottish LADs and False for NI LADs.
        """
        self._scottish_lads = set()
        self._northern_ireland_lads = set()

        # Scottish LADs from the regional file (best effort)
        try:
            scotland_csv = paths.data_path / "input/geography/SC_oa_msoa_lad_regions.csv"
            if not scotland_csv.exists():
                logger.warning(f"Scottish geography file not found: {scotland_csv}")
            else:
                self._scottish_lads = set(pd.read_csv(scotland_csv)['lad'].unique())
                logger.info(f"Loaded {len(self._scottish_lads)} Scottish LADs from {scotland_csv}")
        except Exception as err:
            logger.warning(f"Failed to load Scottish geography file: {err}")

        # Northern Ireland LADs from the regional file (best effort)
        try:
            ni_csv = paths.data_path / "input/geography/NI_dz_sdz_lgd_lookup.csv"
            if not ni_csv.exists():
                logger.warning(f"Northern Ireland geography file not found: {ni_csv}")
            else:
                self._northern_ireland_lads = set(pd.read_csv(ni_csv)['lad'].unique())
                logger.info(f"Loaded {len(self._northern_ireland_lads)} Northern Ireland LADs from {ni_csv}")
        except Exception as err:
            logger.warning(f"Failed to load Northern Ireland geography file: {err}")

        # Add LADs whose areas carry an 'S' prefix in the loaded geography
        if not self.geography_df.empty:
            starts_with_s = self.geography_df['area'].str.startswith('S', na=False)
            lads_from_frame = set(self.geography_df.loc[starts_with_s, 'lad'].unique())

            self._scottish_lads = self._scottish_lads | lads_from_frame
            self._geography_lads_with_scottish_areas = self._scottish_lads

        # Seed the cache; NI entries are written second, so they win on overlap
        self._scottish_lad_cache.update(dict.fromkeys(self._scottish_lads, True))
        self._scottish_lad_cache.update(dict.fromkeys(self._northern_ireland_lads, False))

        logger.info(f"Regional LAD cache built: {len(self._scottish_lads)} Scottish, {len(self._northern_ireland_lads)} Northern Ireland")

    def _create_geography_lookups(self):
        """Create lookup dictionaries for area->LAD and LAD->MSOA mappings"""
        # Area to LAD mapping - already vectorized
        self.area_to_lad = dict(zip(self.geography_df['area'], self.geography_df['lad']))

        # LAD to MSOAs mapping (one LAD can have multiple MSOAs) - vectorized
        # Group by LAD and collect unique MSOAs
        self.lad_to_msoas = (
            self.geography_df.groupby('lad')['msoa']
            .apply(lambda x: x.unique().tolist())
            .to_dict()
        )


    def _create_likelihood_lookups(self):
        """Create lookup dictionaries for origin LAD -> destination LAD likelihoods with work types"""
        # Vectorized approach using groupby
        grouped = self.likelihood_df.groupby('Origin area name')

        self.origin_destinations = {}
        for origin, group in grouped:
            destinations = group['Destination area name'].tolist()
            work_types = group['Place of work indicator'].tolist()
            likelihoods = group['Likelihood'].tolist()

            # Create discrete distribution for sampling
            self.origin_destinations[origin] = {
                'destinations': destinations,
                'work_types': work_types,
                'likelihoods': likelihoods,
                'rv': rv_discrete(values=(np.arange(len(destinations)), likelihoods))
            }

    def _process_workers_data(self):
        """Process workers data for industry-specific allocation"""
        # Identify industry columns (exclude geography and total columns)
        non_industry_cols = ['output_area', 'msoa', 'lad', 'Total']
        self.industry_columns = [col for col in self.workers_df.columns if col not in non_industry_cols]

        # Separate MSOA aggregations (ALL rows) from individual output areas
        self.msoa_aggregations = self.workers_df[self.workers_df['output_area'] == 'ALL'].copy()
        self.oa_data = self.workers_df[self.workers_df['output_area'] != 'ALL'].copy()

        # Create MSOA employment capacity lookup - vectorized
        self.msoa_employment_capacity = dict(zip(
            self.msoa_aggregations['msoa'], 
            self.msoa_aggregations['Total']
        ))

        # Create industry profiles - fully vectorized processing
        self.msoa_industry_profiles = {}
        epsilon = 1e-10

        # Process all MSOAs at once
        industry_data = self.msoa_aggregations[self.industry_columns].values.astype(float)
        safe_counts = industry_data + epsilon
        industry_probs = safe_counts / np.sum(safe_counts, axis=1, keepdims=True)

        # Create profiles for all MSOAs without iterrows
        msoa_list = self.msoa_aggregations['msoa'].tolist()
        for idx, msoa in enumerate(msoa_list):
            self.msoa_industry_profiles[msoa] = {
                'counts': industry_data[idx],
                'probabilities': industry_probs[idx],
                'rv': rv_discrete(values=(np.arange(len(self.industry_columns)), industry_probs[idx]))
            }

        # Track allocated workers per MSOA to respect capacity constraints
        # Only initialize for MSOAs that will exist in the world (we'll populate this during distribute())
        self.allocated_workers_per_msoa = {}

    def _process_sex_industry_data(self):
        """Process sex-industry data for LAD-level sex bias in industry allocation"""
        # Create sex-industry probability distributions by LAD - vectorized
        self.sex_industry_profiles = {}
        epsilon = 1e-10

        # Group by LAD and Sex for efficient processing
        grouped = self.sex_industry_df.groupby(['LAD', 'Sex'])

        for (lad, sex), group in grouped:
            if lad not in self.sex_industry_profiles:
                self.sex_industry_profiles[lad] = {}

            # Get industry employment counts (exclude LAD, Sex, Total columns)
            industry_counts = group[self.industry_columns].iloc[0].values.astype(float)

            # Add small epsilon to prevent zero probabilities
            safe_counts = industry_counts + epsilon
            industry_probs = safe_counts / np.sum(safe_counts)

            self.sex_industry_profiles[lad][sex] = {
                'counts': industry_counts,
                'probabilities': industry_probs,
                'rv': rv_discrete(values=(np.arange(len(self.industry_columns)), industry_probs))
            }

    def distribute(self, areas: Areas, super_areas: SuperAreas, population: Population = None):
        """Assign work locations and sectors to eligible people using LAD-based likelihood data.

        A person is eligible when ``person.primary_activity is None`` and their
        age lies within ``self.age_range`` (both bounds inclusive). For every
        eligible person this calls ``_assign_work_location_by_lad`` followed by
        ``_assign_lockdown_status``. Before the loop the per-run caches are
        built; after it a sample of up to 10 assigned workers is printed and
        the per-run caches are released.

        Args:
            areas (Areas): areas to iterate; each must expose ``name`` and ``people``.
            super_areas (SuperAreas): super areas of the world; the names of
                ``super_areas.members`` define which MSOAs exist in this run.
            population (Population, optional): not used in this method body.
                (Default value = None)

        Raises:
            KeyError: if an area's name is missing from ``self.area_to_lad``
                (raised by ``_get_area_lad``).

        """
        self.areas = areas
        self.super_areas = super_areas

        # Cache world MSOAs for performance - compute once
        self._world_msoas_cache = set(super_area.name for super_area in self.super_areas.members)

        # Initialize worker tracking only for MSOAs that exist in this world
        self.allocated_workers_per_msoa = {msoa: 0 for msoa in self._world_msoas_cache 
                                         if msoa in self.msoa_employment_capacity}

        # Pre-compute capacity lookup for performance optimization
        # (must run first: the caches built below read self._capacity_lookup)
        self._build_capacity_lookup()

        # Pre-compute LAD MSOA selection probabilities (major optimization)
        self._build_lad_msoa_probabilities_cache()

        # Pre-compute global MSOA selection distribution for massive performance gain
        self._build_global_msoa_distribution()

        # Pre-compute LAD destination selections to eliminate expensive scipy rv.rvs() calls
        self._build_lad_destination_distributions()

        # Pre-compute industry probability matrices to eliminate expensive numpy operations per worker
        self._build_industry_probability_cache()

        # Set up lockdown status arrays
        lockdown_tags = np.array(["key_worker", "random", "furlough"])
        lockdown_tags_idx = np.arange(0, len(lockdown_tags))
        lockdown_tags_probabilities_by_sector = (
            self._parse_closure_probabilities_by_sector(
                company_closure=self.company_closure, lockdown_tags=lockdown_tags
            )
        )

        logger.info("Distributing workers to work locations...")

        # Pre-calculate total areas and workers for better progress logging
        total_areas = len(self.areas)
        total_eligible_workers = 0
        areas_with_workers = 0

        # First pass: count total eligible workers for progress tracking
        for area in self.areas:
            area_workers = sum(1 for person in area.people 
                             if person.primary_activity is None and self.age_range[0] <= person.age <= self.age_range[1])
            if area_workers > 0:
                areas_with_workers += 1
                total_eligible_workers += area_workers

        logger.info(f"Found {total_eligible_workers:,} eligible workers across {areas_with_workers:,}/{total_areas:,} areas")

        # NOTE(review): one dict is appended per processed worker although only
        # 10 rows are sampled at the end, so memory grows with the worker count.
        worker_samples = []
        processed_areas = 0
        processed_workers = 0

        # Second pass: assign every eligible worker, area by area
        for i, area in enumerate(iter(self.areas)):
            # Eligible count is recomputed here; it is used only for logging and processed_areas
            area_workers = sum(1 for person in area.people if person.primary_activity is None and self.age_range[0] <= person.age <= self.age_range[1])

            # Progress logging: one line every 100 areas
            if i % 100 == 0:
                percent_complete = (i / total_areas) * 100
                # max(..., 1) avoids dividing by zero when nobody is eligible
                worker_percent = (processed_workers / max(total_eligible_workers, 1)) * 100
                logger.info(f"Processing area {i+1:,}/{total_areas:,} ({percent_complete:.1f}%): {area.name} - {area_workers:,} eligible workers | Total processed: {processed_workers:,}/{total_eligible_workers:,} workers ({worker_percent:.1f}%)")

            if area_workers > 0:
                processed_areas += 1

            # Set up lockdown status lottery for this area (sized by all residents, not only workers)
            self._lockdown_status_lottery(len(area.people))

            # Get LAD for this area
            area_lad = self._get_area_lad(area.name)

            for person in area.people:
                # Skip anyone who already has a primary activity
                if person.primary_activity is not None:
                    continue

                if self.age_range[0] <= person.age <= self.age_range[1]:
                    # Assign work location using LAD-based likelihood
                    # This also assigns the sector based on industry data
                    self._assign_work_location_by_lad(person, area_lad)

                    # Assign lockdown status
                    self._assign_lockdown_status(
                        lockdown_tags_probabilities_by_sector,
                        lockdown_tags,
                        lockdown_tags_idx,
                        person,
                    )

                    # Track processed workers for progress logging
                    processed_workers += 1

                    # Collect sample data
                    worker_samples.append({
                        "| Person ID": person.id,
                        "| Home Area": area.name,
                        "| Home LAD": area_lad,
                        "| Person Age": person.age,
                        "| Assigned Work Super Area": person.work_super_area.name if person.work_super_area else "No Assignment",
                        "| Assigned Work Sector": getattr(person, 'sector', None),
                        "| Work Mode": getattr(person, 'work_mode', None),
                        "| Lockdown Status": getattr(person, 'lockdown_status', None),
                    })

        # Final completion message
        logger.info(f"Worker distribution completed: {processed_workers:,}/{total_eligible_workers:,} workers distributed across {processed_areas:,}/{areas_with_workers:,} areas with workers")

        # Print a random sample of at most 10 assigned workers
        if worker_samples:
            df_sample = pd.DataFrame(worker_samples).sample(n=min(10, len(worker_samples)))
            print("\n===== Sample of Workers Distributed Using LAD-based Likelihood =====")
            print(df_sample.to_string(index=False))

        # Print aggregated statistics summary (currently disabled)
        #self._print_allocation_summary()

        logger.info(f"{len(worker_samples)} workers distributed.")

        # Clear caches to free memory
        # (self._lad_msoa_probabilities_cache is not cleared here)
        self._world_msoas_cache = None
        self._capacity_lookup = None
        self._global_msoa_selections = None
        self._global_msoa_idx = None
        self._lad_destination_cache = None
        self._industry_probability_cache = None

    def _get_area_lad(self, area_name: str) -> str:
        """Get LAD for given area name

        Args:
            area_name (str): 

        """
        return self.area_to_lad[area_name]

    def _build_capacity_lookup(self):
        """Pre-build capacity lookup data structure for performance"""
        self._capacity_lookup = {}

        for msoa_name in self._world_msoas_cache:
            if msoa_name in self.msoa_employment_capacity:
                capacity = self.msoa_employment_capacity[msoa_name]
                allocated = self.allocated_workers_per_msoa.get(msoa_name, 0)
                max_capacity = capacity * 1.2

                available_capacity = max(1, max_capacity - allocated)
                self._capacity_lookup[msoa_name] = {
                    'capacity': capacity,
                    'max_capacity': max_capacity,
                    'allocated': allocated,
                    'available': available_capacity,
                    'has_space': allocated < max_capacity
                }

    def _build_lad_msoa_probabilities_cache(self):
        """Pre-compute MSOA selection probabilities for each LAD.
        This eliminates 37M list building + probability calculation operations.

        """
        logger.info("Pre-computing MSOA selection probabilities by LAD...")

        for lad_name, msoas_in_lad in self.lad_to_msoas.items():
            # Get available MSOAs and their capacity weights
            available_msoas = []
            capacity_weights = []

            for msoa in msoas_in_lad:
                if msoa in self._capacity_lookup:
                    available_msoas.append(msoa)
                    capacity_weights.append(self._capacity_lookup[msoa]['capacity'])

            if not available_msoas:
                continue

            # Pre-compute normalized probabilities
            total_weight = sum(capacity_weights)
            if total_weight > 0:
                probabilities = np.array([w / total_weight for w in capacity_weights], dtype=np.float64)
            else:
                probabilities = np.ones(len(available_msoas), dtype=np.float64) / len(available_msoas)

            # Cache the results
            self._lad_msoa_probabilities_cache[lad_name] = {
                'msoas': np.array(available_msoas),
                'probabilities': probabilities
            }

        logger.info(f"Pre-computed probabilities for {len(self._lad_msoa_probabilities_cache)} LADs")

    def _build_global_msoa_distribution(self):
        """Pre-compute global MSOA selection distribution for massive performance gain.

        Instead of computing weights for every worker (O(workers × MSOAs)),
        pre-compute a large array of MSOA selections based on capacity weights.
        This reduces O(workers × MSOAs) to O(1) per worker.

        """
        if not self._capacity_lookup:
            logger.warning("Capacity lookup not built yet, skipping global MSOA distribution")
            return

        logger.info(f"Building pre-computed global MSOA distribution for {len(self._capacity_lookup)} MSOAs...")

        # Extract MSOAs and their capacity weights
        msoas = list(self._capacity_lookup.keys())
        capacity_weights = [self._capacity_lookup[msoa]['capacity'] for msoa in msoas]

        if not msoas:
            logger.warning("No MSOAs available for global distribution")
            self._global_msoa_selections = []
            self._global_msoa_idx = 0
            return

        # Normalize to probabilities
        total_weight = sum(capacity_weights)
        if total_weight == 0:
            logger.warning("Total capacity weight is zero, using uniform distribution")
            probabilities = [1.0/len(msoas) for _ in msoas]
        else:
            probabilities = [w/total_weight for w in capacity_weights]

        # Pre-compute a large array of MSOA selections (100K selections should be enough)
        # This is much faster than computing probabilities for each worker
        selection_size = min(100000, max(10000, len(msoas) * 100))
        self._global_msoa_selections = np.random.choice(msoas, size=selection_size, p=probabilities)
        self._global_msoa_idx = 0

        logger.info(f"Pre-computed {len(self._global_msoa_selections)} MSOA selections from {len(msoas)} MSOAs")

    def _build_lad_destination_distributions(self):
        """Pre-compute LAD destination selections for massive performance gain.

        Instead of calling expensive scipy rv.rvs() for every worker (380K calls),
        pre-compute arrays of destination selections for each origin LAD.
        This eliminates 30M+ scipy calls for full UK runs!

        """
        logger.info(f"Building pre-computed LAD destination distributions for {len(self.origin_destinations)} origin LADs...")

        self._lad_destination_cache = {}
        total_precomputed = 0

        for origin_lad, destinations_data in self.origin_destinations.items():
            if len(destinations_data['destinations']) == 0:
                logger.warning(f"No destinations for origin LAD: {origin_lad}")
                self._lad_destination_cache[origin_lad] = {
                    'dest_selections': [],
                    'work_type_selections': [],
                    'index': 0
                }
                continue

            # Pre-compute a large array of destination selections for this origin LAD
            # Size based on expected workers (more for busy LADs, minimum for all)
            selection_size = min(50000, max(1000, len(destinations_data['destinations']) * 50))

            # Pre-compute destination indices using the rv distribution
            rv_sampler = destinations_data['rv']
            dest_indices = rv_sampler.rvs(size=selection_size)

            # Convert indices to actual destination LADs and work types
            destinations = destinations_data['destinations']
            work_types = destinations_data['work_types']

            dest_selections = [destinations[idx] for idx in dest_indices]
            work_type_selections = [work_types[idx] for idx in dest_indices]

            self._lad_destination_cache[origin_lad] = {
                'dest_selections': dest_selections,
                'work_type_selections': work_type_selections, 
                'index': 0
            }

            total_precomputed += selection_size

        logger.info(f"Pre-computed {total_precomputed} LAD destination selections for {len(self._lad_destination_cache)} origin LADs")

    def _build_industry_probability_cache(self):
        """Pre-compute industry probability matrices for massive performance gain.

        Instead of computing sex_probs * capacity_probs + normalization for every worker,
        pre-compute final probability arrays for all combinations of (msoa, destination_lad, sex).
        This eliminates 380K+ expensive numpy operations per run!

        """
        logger.info(f"Building pre-computed industry probability cache for {len(self._capacity_lookup)} MSOAs × {len(self.sex_industry_profiles)} LADs × 2 sexes...")

        self._industry_probability_cache = {}
        total_combinations = 0

        # Pre-compute for all existing MSOAs in our world
        for msoa in self._capacity_lookup.keys():
            if msoa not in self.msoa_industry_profiles:
                continue

            capacity_probs = self.msoa_industry_profiles[msoa]['probabilities']

            # Pre-compute for all destination LADs that workers might go to
            for destination_lad in self.sex_industry_profiles.keys():
                for sex in ['Male', 'Female']:
                    try:
                        # Get sex-biased industry probabilities
                        sex_industry_data = self.sex_industry_profiles[destination_lad][sex]
                        sex_probs = sex_industry_data['probabilities']

                        # Combine sex bias and capacity weights (the expensive operation!)
                        combined_weights = sex_probs * capacity_probs

                        # Normalize to get final probabilities (another expensive operation!)
                        prob_sum = np.sum(combined_weights)
                        if prob_sum > 0:
                            final_probs = combined_weights / prob_sum
                        else:
                            # Fallback to uniform distribution if all weights are zero
                            final_probs = np.ones(len(self.industry_columns)) / len(self.industry_columns)

                        # Store the pre-computed final probabilities
                        cache_key = (msoa, destination_lad, sex)
                        self._industry_probability_cache[cache_key] = final_probs
                        total_combinations += 1

                    except (KeyError, IndexError) as e:
                        # Skip combinations that don't exist in the data
                        continue

        logger.info(f"Pre-computed industry probabilities for {total_combinations} (MSOA, LAD, sex) combinations")

    def _update_capacity_lookup(self, msoa_name: str):
        """Optimized capacity lookup update

        Args:
            msoa_name (str): 

        """
        if msoa_name in self._capacity_lookup:
            entry = self._capacity_lookup[msoa_name]
            entry['allocated'] += 1
            # Eliminate redundant max() calculation - pre-compute or use simple comparison
            entry['available'] = entry['max_capacity'] - entry['allocated'] if entry['allocated'] < entry['max_capacity'] else 1
            entry['has_space'] = entry['allocated'] < entry['max_capacity']

    def _get_destination_lad_fast(self, origin_lad: str):
        """Ultra-fast LAD destination selection using pre-computed distributions.

        This replaces expensive scipy rv.rvs() calls with O(1) array lookups.
        For 30M workers, this saves 30M scipy operations!

        Args:
            origin_lad (str): 

        Returns:
            tuple: (destination_lad, work_type)

        """
        if origin_lad not in self._lad_destination_cache:
            # Fallback to original method if not in cache
            return self._get_destination_lad_original(origin_lad)

        cache_data = self._lad_destination_cache[origin_lad]

        # Handle empty destinations case
        if not cache_data['dest_selections']:
            raise ValueError(f"No destinations found for origin LAD: {origin_lad}")

        # Get next pre-computed selection (O(1) operation)
        current_idx = cache_data['index']
        destination_lad = cache_data['dest_selections'][current_idx]
        work_type = cache_data['work_type_selections'][current_idx]

        # Advance index with wraparound
        cache_data['index'] = (current_idx + 1) % len(cache_data['dest_selections'])

        return destination_lad, work_type

    def _get_destination_lad_original(self, origin_lad: str):
        """Original expensive method kept as fallback

        Args:
            origin_lad (str): 

        """
        destinations_data = self.origin_destinations[origin_lad]
        if len(destinations_data['destinations']) == 0:
            raise ValueError(f"No destinations found for origin LAD: {origin_lad}")

        dest_idx = destinations_data['rv'].rvs()  # Expensive scipy call!
        destination_lad = destinations_data['destinations'][dest_idx]
        work_type = destinations_data['work_types'][dest_idx]
        return destination_lad, work_type

    def _assign_work_location_by_lad(self, person: Person, origin_lad: str):
        """Assign work location based on LAD likelihood data

        Args:
            person (Person): 
            origin_lad (str): 

        """
        self.stats['total_workers'] += 1

        # Sample destination LAD and work type using pre-computed selections (massive performance gain)
        destination_lad, work_type = self._get_destination_lad_fast(origin_lad)


        # Save work type to person (separate from lockdown status)
        person.work_mode = work_type

        # Track work mode statistics
        self.stats['work_mode_assignments'][work_type] = self.stats['work_mode_assignments'].get(work_type, 0) + 1

        # Handle special work locations
        if destination_lad in self.non_geographical_work_location:
            location_type = self.non_geographical_work_location[destination_lad]
            if location_type == "home":
                self.stats['assigned_home'] += 1
                person.work_super_area = None
                return
            elif location_type == "bind":
                # Direct assignment without fallback - use origin LAD as destination
                selected_msoa = self._select_best_available_msoa()
                self._assign_person_to_msoa(person, selected_msoa, origin_lad)
                return

        # Handle out-of-scope destinations (offshore, outside UK, Northern Ireland LADs, etc.)
        out_of_scope_destinations = {'Offshore Installation', 'Outside UK', 'England', 'Wales', 'Northern Ireland'}
        # Use dynamically loaded Northern Ireland LADs
        if destination_lad in out_of_scope_destinations or destination_lad in self._northern_ireland_lads:
            # Allocate to random MSOA in our world using capacity weights, but use origin LAD for sex bias
            selected_msoa = self._select_best_available_msoa()
            self._assign_person_to_msoa(person, selected_msoa, origin_lad)
            self.stats['assigned_out_of_scope'] += 1
            return

        # Handle cross-border work assignments
        origin_is_scottish = self._is_scottish_lad(origin_lad)
        destination_is_scottish = self._is_scottish_lad(destination_lad)

        # If Scottish person trying to work in England/Wales - mark as out of scope
        if origin_is_scottish and not destination_is_scottish and destination_lad not in self.lad_to_msoas:
            selected_msoa = self._select_best_available_msoa()
            self._assign_person_to_msoa(person, selected_msoa, origin_lad)
            self.stats['assigned_out_of_scope'] += 1
            return

        # Map destination LAD to a specific MSOA/SuperArea using employment capacity
        selected_msoa = self._select_msoa_from_lad(destination_lad)

        if selected_msoa is None:
            # No MSOAs in destination LAD exist in our world - bounce back to global selection
            selected_msoa = self._select_best_available_msoa()
            self._assign_person_to_msoa(person, selected_msoa, destination_lad)
            # Check if this was a cross-border assignment
            if self._is_scottish_lad(destination_lad):
                self.stats['assigned_cross_border_ew_to_scotland'] = self.stats.get('assigned_cross_border_ew_to_scotland', 0) + 1
            else:
                self.stats['assigned_bounced_back_msoa'] += 1
        else:
            # Successfully assigned to MSOA in destination LAD
            self._assign_person_to_msoa(person, selected_msoa, destination_lad)
            self.stats['assigned_first_try_msoa'] += 1

    def _is_scottish_lad(self, lad_name: str) -> bool:
        """Check if a LAD is Scottish - uses dynamically loaded regional data

        Args:
            lad_name (str): 

        """
        # Fast lookup in pre-built cache
        if lad_name in self._scottish_lad_cache:
            return self._scottish_lad_cache[lad_name]

        # If not in cache, check if it's in the dynamically loaded Scottish LADs
        is_scottish = lad_name in self._scottish_lads
        self._scottish_lad_cache[lad_name] = is_scottish
        return is_scottish

    def _select_msoa_from_lad(self, destination_lad: str) -> str:
        """Optimized MSOA selection using pre-computed probabilities

        Args:
            destination_lad (str): 

        """
        # Check if LAD exists in our mapping
        if destination_lad not in self.lad_to_msoas:
            # Check if this is a cross-border assignment (EW -> Scottish LAD)
            if self._is_scottish_lad(destination_lad):
                # English/Welsh person working in Scotland - allow this by bouncing back to available MSOAs
                return None  # Will trigger bounced_back logic in calling function

            print(f"ERROR: Destination LAD '{destination_lad}' not found in lad_to_msoas mapping")
            print(f"Available LADs: {list(self.lad_to_msoas.keys())[:10]}")
            raise KeyError(f"LAD '{destination_lad}' not found in LAD-to-MSOA mapping")

        # Use pre-computed probabilities (massive optimization!)
        if destination_lad not in self._lad_msoa_probabilities_cache:
            return None  # No available MSOAs in this LAD

        cache_data = self._lad_msoa_probabilities_cache[destination_lad]
        available_msoas = cache_data['msoas']
        probabilities = cache_data['probabilities']

        if len(available_msoas) == 1:
            return available_msoas[0]

        # Use numba-accelerated weighted selection
        selected_idx = _weighted_choice_fast(probabilities)
        return available_msoas[selected_idx]

    def _assign_person_to_msoa(self, person: Person, selected_msoa: str, destination_lad: str):
        """Register ``person`` as a worker of ``selected_msoa`` and give them an industry.

        Args:
            person (Person): Worker being placed.
            selected_msoa (str): Super area name; looked up directly in
                ``self.super_areas.members_by_name`` with no existence check.
            destination_lad (str): LAD forwarded to the industry assignment step.

        """
        # The MSOA is assumed to exist in the world; an unknown name is not handled here
        self.super_areas.members_by_name[selected_msoa].add_worker(person)

        # Bookkeeping: per-MSOA head count, then the derived capacity lookup
        self.allocated_workers_per_msoa[selected_msoa] += 1
        self._update_capacity_lookup(selected_msoa)

        # Sector (and possibly sub-sector) is assigned from the LAD and MSOA profiles
        self._assign_industry_with_capacity_and_sex_bias(person, selected_msoa, destination_lad)

        self.stats['assigned_by_lad_likelihood'] += 1

    def _get_industry_assignment_fast(self, msoa: str, destination_lad: str, sex_key: str):
        """Ultra-fast industry assignment using pre-computed probability matrices.

        This replaces expensive numpy operations (sex_probs * capacity_probs + normalization)
        with O(1) cache lookups. For 30M workers, this saves 30M+ expensive numpy operations!

        Args:
            msoa (str): 
            destination_lad (str): 
            sex_key (str): 

        Returns:
            str: Industry name

        """
        cache_key = (msoa, destination_lad, sex_key)

        if cache_key in self._industry_probability_cache:
            # Use pre-computed probabilities (massive performance gain)
            final_probs = self._industry_probability_cache[cache_key]
        else:
            # Fallback to original expensive method if not in cache
            return self._get_industry_assignment_original(msoa, destination_lad, sex_key)

        # Fast industry selection using pre-computed probabilities
        industry_idx = np.random.choice(len(self.industry_columns), p=final_probs)
        return self.industry_columns[industry_idx]

    def _get_industry_assignment_original(self, msoa: str, destination_lad: str, sex_key: str):
        """Original expensive method kept as fallback

        Args:
            msoa (str): 
            destination_lad (str): 
            sex_key (str): 

        """
        # Get sex-biased industry probabilities from destination work LAD
        sex_industry_data = self.sex_industry_profiles[destination_lad][sex_key]
        sex_probs = sex_industry_data['probabilities']

        # Get MSOA capacity-based industry probabilities
        msoa_industry_data = self.msoa_industry_profiles[msoa]
        capacity_probs = msoa_industry_data['probabilities']

        # Combine sex bias and capacity weights (expensive operations!)
        combined_weights = sex_probs * capacity_probs
        final_probs = combined_weights / np.sum(combined_weights)

        if len(final_probs) == 0 or np.sum(final_probs) == 0:
            raise ValueError(f"Empty probabilities for industry assignment: MSOA={msoa}, LAD={destination_lad}, sex={sex_key}")

        industry_idx = np.random.choice(len(self.industry_columns), p=final_probs)
        return self.industry_columns[industry_idx]

    def _assign_industry_with_capacity_and_sex_bias(self, person: Person, msoa: str, destination_lad: str):
        """Industry assignment using pre-computed probability matrices for massive performance gain

        Args:
            person (Person): 
            msoa (str): 
            destination_lad (str): 

        """
        sex_key = 'Female' if person.sex == 'f' else 'Male'
        industry_name = self._get_industry_assignment_fast(msoa, destination_lad, sex_key)

        # Store the sector code
        person.sector = self._map_industry_to_sector(industry_name)

        # Assign sub-sector if applicable
        if person.sector in self.sub_sector_ratio:
            self._assign_sub_sector(person)

        # Track allocation by MSOA, sector, and sex efficiently
        self._track_msoa_sector_allocation_fast(msoa, person.sector, person.sex)

        # Update stats
        self.stats['assigned_with_industry_data'] += 1
        self.stats['industry_assignments'][industry_name] = self.stats['industry_assignments'].get(industry_name, 0) + 1

    def _map_industry_to_sector(self, industry_name: str) -> str:
        """Map full industry name to simplified sector code

        Args:
            industry_name (str): 

        """
        # Create a mapping from industry names to sector codes
        industry_to_sector = {
            'Agriculture; Forestry; Fishing': 'A',
            'Mining and Quarrying': 'B',
            'Manufacturing': 'C',
            'Electricity, Gas, Steam and Air Conditioning Supply': 'D',
            'Water Supply; Sewage; Waste Management and Remediation activities': 'E',
            'Construction': 'F',
            'Wholesale and Retail trade; Repair of Motor Vehicles and Motorcycles': 'G',
            'Transport and Storage': 'H',
            'Accommodation and Food Service Activities': 'I',
            'Information and Communication': 'J',
            'Financial and Insurance Activities': 'K',
            'Real Estate Activities': 'L',
            'Professional Scientific and Technical Activities': 'M',
            'Administrative and Support Service Activities': 'N',
            'Public Administration and Defence; Compulsory Social Security': 'O',
            'Education': 'P',
            'Human Health and Social Work Activities': 'Q',
            'Other': 'R'  # Aggregated category for all R, S, T, U sectors
        }

        return industry_to_sector.get(industry_name, 'Z')  # 'Z' for unknown

    def _track_msoa_sector_allocation_fast(self, msoa: str, sector: str, person_sex: str):
        """Optimized tracking of worker allocation by MSOA, sector, and sex

        Args:
            msoa (str): 
            sector (str): 
            person_sex (str): 

        """
        # Direct tracking - assume MSOA is valid
        msoa_data = self.stats['msoa_sector_allocations'].setdefault(msoa, {})
        msoa_data[sector] = msoa_data.get(sector, 0) + 1

        # Track by sex as well
        msoa_sex_data = self.stats['msoa_sector_sex_allocations'].setdefault(msoa, {})
        sector_sex_data = msoa_sex_data.setdefault(sector, {'Female': 0, 'Male': 0})
        sex_key = 'Female' if person_sex == 'f' else 'Male'
        sector_sex_data[sex_key] += 1

    def _track_msoa_sector_allocation(self, msoa: str, sector: str):
        """Legacy method - kept for backward compatibility

        Args:
            msoa (str): 
            sector (str): 

        """
        self._track_msoa_sector_allocation_fast(msoa, sector)

    def _select_best_available_msoa(self) -> str:
        """Select best available MSOA using pre-computed distribution for massive performance gain

        """
        return self._select_best_available_msoa_fast()

    def _select_best_available_msoa_fast(self) -> str:
        """Ultra-fast MSOA selection using pre-computed distribution.

        This replaces O(MSOAs) computation per worker with O(1) array lookup.
        For 30M workers × 7K MSOAs, this saves ~210 billion operations!

        """
        if not hasattr(self, '_global_msoa_selections') or len(self._global_msoa_selections) == 0:
            # Fallback to original method if pre-computation failed
            return self._select_best_available_msoa_original()

        # Get next pre-computed selection (O(1) operation)
        selected_msoa = self._global_msoa_selections[self._global_msoa_idx]

        # Advance index with wraparound
        self._global_msoa_idx = (self._global_msoa_idx + 1) % len(self._global_msoa_selections)

        return selected_msoa

    def _select_best_available_msoa_original(self) -> str:
        """Original expensive method kept as fallback

        """
        # Use all MSOAs with capacity-based weights
        available_msoas = []
        capacity_weights = []

        for msoa_name, capacity_info in self._capacity_lookup.items():
            available_msoas.append(msoa_name)
            # Use original capacity as weight (bigger capacity = higher probability, even when overcrowded)
            capacity_weights.append(capacity_info['capacity'])

        if len(available_msoas) == 0:
            print(f"ERROR: No available MSOAs in _select_best_available_msoa")
            print(f"_capacity_lookup size: {len(self._capacity_lookup)}")
            print(f"_world_msoas_cache size: {len(self._world_msoas_cache) if self._world_msoas_cache else 'None'}")
            raise ValueError("No available MSOAs for global selection")

        # Select MSOA weighted by employment capacity
        if len(available_msoas) == 1:
            return available_msoas[0]

        # Weighted selection
        total_weight = sum(capacity_weights)
        probabilities = [w/total_weight for w in capacity_weights]
        return np.random.choice(available_msoas, p=probabilities)


    def _print_allocation_summary(self):
        """Print simplified allocation summary"""
        total = self.stats['total_workers']
        if total == 0:
            print("\n===== Worker Allocation Summary =====")
            print("No workers were processed.")
            return

        print("\n===== Worker Allocation Summary =====")
        print(f"Total workers processed: {total:,}")
        print()

        # Calculate actual workplace assignments (exclude work from home)
        workplace_assignments = total - self.stats['assigned_home']

        # Work from home
        home = self.stats['assigned_home']
        if home > 0:
            print(f"🏠 Assigned to work from home: {home:,} ({100*home/total:.1f}%)")

        # Workplace allocation breakdown
        if workplace_assignments > 0:
            print(f"🏢 Assigned to workplace: {workplace_assignments:,} ({100*workplace_assignments/total:.1f}%)")

            # First try assignments (to destination LAD MSOAs)
            first_try = self.stats['assigned_first_try_msoa']
            if first_try > 0:
                print(f"   ✓ Assigned to destination LAD MSOA: {first_try:,} ({100*first_try/workplace_assignments:.1f}%)")

            # Bounced back assignments (destination LAD MSOAs don't exist)
            bounced_back = self.stats['assigned_bounced_back_msoa']
            if bounced_back > 0:
                print(f"   ↩️ Bounced back (destination MSOA not in world): {bounced_back:,} ({100*bounced_back/workplace_assignments:.1f}%)")

            # Out-of-scope assignments
            out_of_scope = self.stats['assigned_out_of_scope']
            if out_of_scope > 0:
                print(f"   🌐 From out-of-scope destinations: {out_of_scope:,} ({100*out_of_scope/workplace_assignments:.1f}%)")

        # Industry allocation summary
        print()
        print("=== Industry Allocation ===")
        industry_with_data = self.stats['assigned_with_industry_data']
        print(f"Workers assigned using MSOA industry data: {industry_with_data:,}")

        # Show top 5 assigned industries
        if self.stats['industry_assignments']:
            sorted_industries = sorted(self.stats['industry_assignments'].items(), key=lambda x: x[1], reverse=True)
            print(f"\nTop 5 assigned industries:")
            for industry, count in sorted_industries[:5]:
                percentage = 100 * count / total if total > 0 else 0
                # Shorten industry names for display
                short_name = industry.split(';')[0] if ';' in industry else industry
                if len(short_name) > 40:
                    short_name = short_name[:37] + "..."
                print(f"  {short_name}: {count:,} ({percentage:.1f}%)")

        # Sub-sector assignments
        if self.stats['sub_sector_assignments']:
            total_sub_sectors = sum(self.stats['sub_sector_assignments'].values())
            print(f"\nSub-sector assignments: {total_sub_sectors:,} workers assigned specialized sub-sectors")

            # Show top 5 sub-sectors
            sorted_sub_sectors = sorted(self.stats['sub_sector_assignments'].items(), key=lambda x: x[1], reverse=True)
            print("Top 5 sub-sectors:")
            for sub_sector, count in sorted_sub_sectors[:5]:
                percentage = 100 * count / total_sub_sectors if total_sub_sectors > 0 else 0
                print(f"  {sub_sector}: {count:,} ({percentage:.1f}%)")

            # Show sample assignments
            if self.stats['sub_sector_samples']:
                print(f"\nSample sub-sector assignments:")
                print(f"{'ID':>8} {'Sex':>3} {'Sector':>6} {'Sub-sector':>25} {'Home Area':>15} {'Work Area':>15}")
                print(f"{'-'*8} {'-'*3} {'-'*6} {'-'*25} {'-'*15} {'-'*15}")
                for sample in self.stats['sub_sector_samples'][:10]:  # Show first 10
                    print(f"{sample['person_id']:>8} {sample['sex']:>3} {sample['sector']:>6} {sample['sub_sector']:>25} {sample['area']:>15} {sample['work_area']:>15}")

        # MSOA capacity usage
        if hasattr(self, 'allocated_workers_per_msoa'):
            over_capacity = sum(1 for msoa, allocated in self.allocated_workers_per_msoa.items() 
                              if allocated > self.msoa_employment_capacity.get(msoa, 0))
            if over_capacity > 0:
                print(f"\nMSOAs operating over recorded capacity: {over_capacity:,}")

        print("=" * 40)

        # Print MSOA sector comparison
        self._print_msoa_sector_comparison()

    def _print_msoa_sector_comparison(self):
        """Print detailed comparison of allocated vs source data by MSOA and sector"""
        print("\n=== MSOA Sector Allocation vs Source Data ===")

        # Create reverse mapping from sector codes to industry names
        sector_to_industry = {}
        for industry, sector in zip(self.industry_columns, ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U']):
            sector_to_industry[sector] = industry

        # Use cached world MSOAs for performance
        world_msoas = self._world_msoas_cache

        # Filter to only MSOAs that exist in the world and have capacity data
        valid_msoas = [(msoa, capacity) for msoa, capacity in self.msoa_employment_capacity.items() 
                      if msoa in world_msoas]
        valid_msoas.sort(key=lambda x: x[1], reverse=True)

        if not valid_msoas:
            print("No valid MSOAs found that exist in both census data and world SuperAreas")
            return

        # Show data coverage statistics
        total_census_msoas = len(self.msoa_employment_capacity)
        total_world_msoas = len(world_msoas)
        matched_msoas = len(valid_msoas)

        print(f"Data Coverage: {matched_msoas:,} MSOAs matched between census ({total_census_msoas:,}) and world ({total_world_msoas:,})")

        # Show top 5 MSOAs by total capacity that exist in the world
        top_msoas = valid_msoas

        for msoa, total_capacity in top_msoas:
            print(f"\n--- {msoa} (Capacity: {total_capacity:,}) ---")

            # Get source data for this MSOA
            msoa_source = self.msoa_aggregations[self.msoa_aggregations['msoa'] == msoa]
            if msoa_source.empty:
                print("  No source data available")
                continue

            source_row = msoa_source.iloc[0]
            allocated_data = self.stats['msoa_sector_allocations'].get(msoa, {})
            total_allocated = self.allocated_workers_per_msoa.get(msoa, 0)

            print(f"  Total: Source={total_capacity:,}, Allocated={total_allocated:,}, Diff={total_allocated-total_capacity:+,}")

            # Show top sectors for this MSOA
            source_sectors = []
            for industry in self.industry_columns:
                if industry in source_row:
                    count = source_row[industry]
                    if count > 0:
                        sector = self._map_industry_to_sector(industry)
                        source_sectors.append((sector, industry, count))

            # Sort by source count and show top 5
            source_sectors.sort(key=lambda x: x[2], reverse=True)

            print(f"  {'Sector':>6} {'Industry':>25} {'Source':>8} {'Allocated':>10} {'F/M':>8} {'Diff':>8} {'%Diff':>8}")
            print(f"  {'-'*6} {'-'*25} {'-'*8} {'-'*10} {'-'*8} {'-'*8} {'-'*8}")

            for sector, industry, source_count in source_sectors:
                allocated_count = allocated_data.get(sector, 0)
                diff = allocated_count - source_count
                percent_diff = (diff / source_count * 100) if source_count > 0 else 0

                # Get gender breakdown for this sector
                sex_data = self.stats['msoa_sector_sex_allocations'].get(msoa, {}).get(sector, {'Female': 0, 'Male': 0})
                female_count = sex_data['Female']
                male_count = sex_data['Male']
                gender_ratio = f"{female_count}/{male_count}" if allocated_count > 0 else "0/0"

                # Shorten industry name for display
                short_industry = industry.split(';')[0][:25]

                print(f"  {sector:>6} {short_industry:>25} {source_count:>8} {allocated_count:>10} {gender_ratio:>8} {diff:>+8} {percent_diff:>+7.1f}%")

        # Summary statistics across all MSOAs that exist in the world
        print(f"\n--- Summary Across World MSOAs ({len(valid_msoas)} MSOAs) ---")

        # Calculate totals only for MSOAs that exist in the world
        total_source = sum(capacity for _, capacity in valid_msoas)
        total_allocated = sum(self.allocated_workers_per_msoa.get(msoa, 0) for msoa, _ in valid_msoas)
        overall_diff = total_allocated - total_source
        overall_percent = (overall_diff / total_source * 100) if total_source > 0 else 0

        print(f"Total Employment: Source={total_source:,}, Allocated={total_allocated:,}")
        print(f"Overall Difference: {overall_diff:+,} ({overall_percent:+.1f}%)")

        # Show which MSOAs have largest over/under allocation (only for world MSOAs)
        differences = []
        for msoa, source_cap in valid_msoas:
            allocated = self.allocated_workers_per_msoa.get(msoa, 0)
            diff = allocated - source_cap
            if source_cap > 0:  # Only consider MSOAs with actual capacity
                percent_diff = diff / source_cap * 100
                differences.append((msoa, diff, percent_diff, source_cap))

        # Most over-allocated
        differences.sort(key=lambda x: x[2], reverse=True)
        print(f"\nMost Over-allocated MSOAs:")
        for msoa, diff, percent_diff, capacity in differences[:3]:
            print(f"  {msoa}: {diff:+,} ({percent_diff:+.1f}%) from {capacity:,}")

        # Most under-allocated
        differences.sort(key=lambda x: x[2])
        print(f"\nMost Under-allocated MSOAs:")
        for msoa, diff, percent_diff, capacity in differences[:3]:
            print(f"  {msoa}: {diff:+,} ({percent_diff:+.1f}%) from {capacity:,}")

        print("=" * 60)



    def _assign_sub_sector(self, person):
        """Assign sub-sector job as defined in config

        Args:
            person: 

        """
        MC_random = np.random.uniform()
        ratio = self.sub_sector_ratio[person.sector][person.sex]
        distr = self.sub_sector_distr[person.sector][person.sex]
        if MC_random < ratio:
            sub_sector_idx = rv_discrete(values=(np.arange(len(distr)), distr)).rvs()
            person.sub_sector = self.sub_sector_distr[person.sector]["label"][
                sub_sector_idx
            ]

            # Track sub-sector assignment
            sub_sector_key = f"{person.sector}:{person.sub_sector}"
            self.stats['sub_sector_assignments'][sub_sector_key] = self.stats['sub_sector_assignments'].get(sub_sector_key, 0) + 1

            # Keep sample of sub-sector assignments (first 20 for display)
            if len(self.stats['sub_sector_samples']) < 20:
                self.stats['sub_sector_samples'].append({
                    'person_id': person.id,
                    'sector': person.sector,
                    'sub_sector': person.sub_sector,
                    'sex': person.sex,
                    'area': person.area.name,
                    'work_area': person.work_super_area.name if person.work_super_area else "No Assignment"
                })

    def _lockdown_status_lottery(self, n_workers):
        """Create lockdown status lottery for workers

        Args:
            n_workers: 

        """
        self.lockdown_status_random = np.random.choice(2, n_workers, p=[4 / 5, 1 / 5])

    def _parse_closure_probabilities_by_sector(
        self, company_closure: dict, lockdown_tags: List
    ):
        """Parse closure probabilities from config

        Args:
            company_closure (dict): 
            lockdown_tags (List): 

        """
        ret = {}
        for sector in company_closure:
            ret[sector] = np.array(
                [
                    self.company_closure[sector][lockdown_tags[0]],
                    self.company_closure[sector][lockdown_tags[1]], 
                    self.company_closure[sector][lockdown_tags[2]],
                ]
            )
        return ret

    def _assign_lockdown_status(
        self,
        probabilities_by_sector: dict,
        lockdown_tags: List[str],
        lockdown_tags_idx: List[int],
        person: Person,
    ):
        """Assign lockdown_status based on work mode and sector probabilities.
        Work mode influences lockdown status:
        - From_Home: Always furlough (already remote)
        - Hybrid: Never key_worker (essential services need full physical presence)
        - Normal: Use full sector probability distribution

        Args:
            probabilities_by_sector (dict): 
            lockdown_tags (List[str]): 
            lockdown_tags_idx (List[int]): 
            person (Person): 

        """

        work_mode = getattr(person, 'work_mode', 'Normal')

        if work_mode == 'From_Home':
            # Remote workers effectively can't go to workplace during lockdowns
            person.lockdown_status = "furlough"

        elif work_mode == 'Hybrid':
            # Hybrid workers are never key workers (can't do essential services remotely)
            # Choose between furlough and random based on sector, excluding key_worker
            sector_probs = probabilities_by_sector[person.sector]

            # Get furlough and random probabilities, normalize them
            furlough_prob = sector_probs[2]  # furlough is index 2
            random_prob = sector_probs[1]    # random is index 1
            total_non_key = furlough_prob + random_prob

            if total_non_key > 0:
                # Randomly assign between furlough and random, proportionally
                if np.random.random() < furlough_prob / total_non_key:
                    person.lockdown_status = "furlough"
                else:
                    person.lockdown_status = "random"
            else:
                # Fallback: if sector has no furlough/random (100% key workers), make them random
                person.lockdown_status = "random"

        else:  # work_mode == 'Normal' or unknown
            # Normal workers use full sector probability distribution
            idx = random_choice_numba(
                lockdown_tags_idx, probabilities_by_sector[person.sector]
            )

            # Currently all people definitely not furloughed or key are assigned a 'random' tag which allows for
            # them to dynamically be sent to work. For now we fix this so that the same 1/5 people go to work once a week
            # rather than a 1/5 chance that a person with a 'random' tag goes to work.
            # If commented out then people will be correctly assigned random tag for going to work randomly
            # if value == "random" and self.lockdown_status_random[idx] == 0:
            #    value = "furlough"

            person.lockdown_status = lockdown_tags[idx]


    @classmethod
    def for_super_areas(
        cls,
        area_names: List[str],
        config_file: str = default_config_file,
        policy_config_file: str = default_policy_config_file,
    ) -> "WorkerDistributorNew":
        """Build a WorkerDistributorNew for the given super areas.

        Convenience constructor that forwards its arguments unchanged to
        ``from_file``.

        Args:
            area_names (List[str]): Names of the super areas in the world.
            config_file (str, optional): Worker distributor configuration file. (Default value = default_config_file)
            policy_config_file (str, optional): Policy configuration file. (Default value = default_policy_config_file)

        Returns:
            WorkerDistributorNew: The distributor created by ``from_file``.

        """
        distributor = cls.from_file(
            area_names=area_names,
            config_file=config_file,
            policy_config_file=policy_config_file,
        )
        return distributor

    @classmethod
    def from_file(
        cls,
        area_names: List[str] = None,
        config_file: str = default_config_file,
        policy_config_file: str = default_policy_config_file,
    ) -> "WorkerDistributorNew":
        """Create a WorkerDistributorNew from the default regional data files.

        Data for Scotland, England/Wales and Northern Ireland is always loaded,
        whatever ``area_names`` contains: likelihood data may reference LADs of
        other regions as work destinations. ``area_names`` is only used to log
        which regions the world contains (by the first letter of each name:
        S, E/W, N).

        The Northern Ireland files are optional. They are merged only when all
        four could be read, so a missing file can no longer leave the four
        tables partially extended with Northern Ireland rows.

        Args:
            area_names (List[str], optional): List of SuperArea names for which to initiate WorkerDistributorNew (Default value = None)
            config_file (str, optional): Configuration file with worker distributor settings (Default value = default_config_file)
            policy_config_file (str, optional): Policy configuration file with company closure settings (Default value = default_policy_config_file)

        Returns:
            WorkerDistributorNew: Distributor built from the concatenated regional tables.

        """
        area_names = area_names or []

        # All regions are loaded unconditionally to support cross-border work
        # assignments; the flags are kept so the strategy is logged explicitly.
        need_scotland = True
        need_england_wales = True
        need_northern_ireland = True

        if area_names:
            # Purely informational: which regions have areas in our world
            area_codes = set(area_names)
            has_scotland_areas = any(area.startswith('S') for area in area_codes)
            has_england_wales_areas = any(area.startswith(('E', 'W')) for area in area_codes)
            has_northern_ireland_areas = any(area.startswith('N') for area in area_codes)

            logger.info(f"World contains: Scotland areas={has_scotland_areas}, England/Wales areas={has_england_wales_areas}, Northern Ireland areas={has_northern_ireland_areas}")
            logger.info("Loading all regional data to support cross-border work assignments")

        logger.info(f"Data loading strategy: Scotland={need_scotland}, England/Wales={need_england_wales}, Northern Ireland={need_northern_ireland}")

        # Regional tables are collected first and concatenated once per table
        likelihood_frames = []
        workers_frames = []
        sex_industry_frames = []
        geography_frames = []

        if need_scotland:
            logger.info("Loading Scottish work data files...")
            likelihood_frames.append(pd.read_csv(default_likelihood_file))
            workers_frames.append(pd.read_csv(default_workers_file))
            sex_industry_frames.append(pd.read_csv(default_sex_industry_file))
            geography_frames.append(pd.read_csv(default_geography_file))

        if need_england_wales:
            logger.info("Loading England/Wales work data files...")
            likelihood_frames.append(pd.read_csv(default_ew_likelihood_file))
            workers_frames.append(pd.read_csv(default_ew_workers_file))
            sex_industry_frames.append(pd.read_csv(default_ew_sex_industry_file))
            geography_frames.append(pd.read_csv(default_ew_geography_file, low_memory=False))

        if need_northern_ireland:
            logger.info("Loading Northern Ireland work data files...")
            try:
                # Read everything before merging anything: all four or none
                ni_likelihood_df = pd.read_csv(default_ni_likelihood_file)
                ni_workers_df = pd.read_csv(default_ni_workers_file)
                ni_sex_industry_df = pd.read_csv(default_ni_sex_industry_file)
                ni_geography_df = pd.read_csv(default_ni_geography_file)
            except FileNotFoundError as e:
                logger.warning(f"Northern Ireland work data file not found: {e}")
                logger.info("Continuing without Northern Ireland work data - NI workers will be handled as out-of-scope")
            else:
                likelihood_frames.append(ni_likelihood_df)
                workers_frames.append(ni_workers_df)
                sex_industry_frames.append(ni_sex_industry_df)
                geography_frames.append(ni_geography_df)
                logger.info("Successfully loaded Northern Ireland work data files")

        def _combine(frames):
            # pd.concat rejects an empty list, so fall back to an empty frame
            return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

        # Load config files
        with open(config_file, 'r') as f:
            config = yaml.safe_load(f)

        with open(policy_config_file, 'r') as f:
            policy_config = yaml.safe_load(f)

        return cls(
            likelihood_df=_combine(likelihood_frames),
            geography_df=_combine(geography_frames),
            workers_df=_combine(workers_frames),
            sex_industry_df=_combine(sex_industry_frames),
            company_closure=policy_config["company_closure"]["sectors"],
            age_range=config['age_range'],
            sub_sector_ratio=config['sub_sector_ratio'],
            sub_sector_distr=config['sub_sector_distr'],
            non_geographical_work_location=config['non_geographical_work_location']
        )

`__init__(likelihood_df, geography_df, workers_df, sex_industry_df, company_closure, age_range, sub_sector_ratio, sub_sector_distr, non_geographical_work_location)`

Parameters:

Name	Type	Description	Default
`likelihood_df`	`DataFrame`	DataFrame with origin LAD, destination LAD, and likelihood of working there	required
`geography_df`	`DataFrame`	DataFrame mapping area -> msoa -> lad -> region	required
`workers_df`	`DataFrame`	DataFrame with industry-specific employment data by output area and MSOA	required
`company_closure`	`dict`	Lockdown status probabilities by sector	required
`age_range`	`List[int]`	Min and max age for workers	required
`sub_sector_ratio`	`dict`	Key sector ratios by sex	required
`sub_sector_distr`	`dict`	Key sector distributions by sex	required
`non_geographical_work_location`	`dict`	Special work locations (home, offshore, etc.)	required

Source code in june/distributors/worker_distributor_new.py

def __init__(
    self,
    likelihood_df: pd.DataFrame,
    geography_df: pd.DataFrame,
    workers_df: pd.DataFrame,
    sex_industry_df: pd.DataFrame,
    company_closure: dict,
    age_range: List[int],
    sub_sector_ratio: dict,
    sub_sector_distr: dict,
    non_geographical_work_location: dict,
):
    """Store the input data and configuration, reset the statistics counters,
    and build the lookup structures used later by ``distribute``.

    Args:
      likelihood_df (pd.DataFrame):
        DataFrame with origin LAD, destination LAD, and likelihood of working there
      geography_df (pd.DataFrame):
        DataFrame mapping area -> msoa -> lad -> region
      workers_df (pd.DataFrame):
        DataFrame with industry-specific employment data by output area and MSOA
      sex_industry_df (pd.DataFrame):
        Stored as ``self.sex_industry_df`` and consumed by
        ``_process_sex_industry_data``; presumably industry employment
        broken down by sex (TODO confirm schema)
      company_closure (dict):
        Lockdown status probabilities by sector
      age_range (List[int]):
        Min and max age for workers
      sub_sector_ratio (dict):
        Key sector ratios by sex
      sub_sector_distr (dict):
        Key sector distributions by sex
      non_geographical_work_location (dict):
        Special work locations (home, offshore, etc.)
    """
    # Keep references to the raw inputs; the helpers called below read them
    # from the instance.
    self.likelihood_df = likelihood_df
    self.geography_df = geography_df
    self.workers_df = workers_df
    self.sex_industry_df = sex_industry_df
    self.age_range = age_range
    self.sub_sector_ratio = sub_sector_ratio
    self.sub_sector_distr = sub_sector_distr
    self.non_geographical_work_location = non_geographical_work_location
    self.company_closure = company_closure
    # NOTE(review): `count` is presumably itertools.count - confirm against the file imports
    self._boundary_workers_counter = count()
    self.n_boundary_workers = 0

    # Simplified statistics tracking
    self.stats = {
        'total_workers': 0,
        'assigned_by_lad_likelihood': 0,
        'assigned_home': 0,
        'assigned_out_of_scope': 0,
        'assigned_first_try_msoa': 0,
        'assigned_bounced_back_msoa': 0,
        'assigned_cross_border_ew_to_scotland': 0,
        'assigned_with_industry_data': 0,
        'industry_assignments': {},
        'work_mode_assignments': {},  # Track work mode assignments (Normal/Hybrid/From_Home)
        'msoa_sector_allocations': {},  # Track allocated workers by MSOA and sector
        'msoa_sector_sex_allocations': {},  # Track allocated workers by MSOA, sector, and sex
        'sub_sector_assignments': {},  # Track sub-sector assignments
        'sub_sector_samples': []  # Sample of people with sub-sectors for display
    }

    # Create lookup dictionaries for efficient mapping.
    # These run before the cache attributes below are assigned, so the call
    # order here should not be changed without checking each helper.
    self._create_geography_lookups()
    self._create_likelihood_lookups()
    self._process_workers_data()
    self._process_sex_industry_data()

    # Placeholders only: nothing is computed here. distribute() fills
    # _world_msoas_cache and calls _build_capacity_lookup(), then resets
    # both attributes to None when it finishes.
    self._capacity_lookup = None
    self._world_msoas_cache = None

    # Initialize performance caches
    self._scottish_lad_cache = {}
    self._geography_lads_with_scottish_areas = None
    self._pre_build_lad_regional_cache()

    # Worker distribution optimization caches (start empty)
    self._lad_msoa_probabilities_cache = {}  # Pre-computed MSOA selection probabilities by LAD
    self._capacity_arrays = {}  # Numpy arrays for fast capacity tracking

`distribute(areas, super_areas, population=None)`

Assign work locations and sectors to eligible people using LAD-based likelihood data.

Parameters:

Name	Type	Description	Default
`areas`	`Areas`		required
`super_areas`	`SuperAreas`		required
`population`	`Population`	(Default value = None)	`None`

Source code in june/distributors/worker_distributor_new.py

def distribute(self, areas: Areas, super_areas: SuperAreas, population: Population = None):
    """Give every eligible person a work location, sector and lockdown status.

    A person is eligible when they have no primary activity yet and their age
    lies within ``self.age_range`` (both bounds inclusive). Progress is logged
    on every 100th area and a random sample of up to 10 processed workers is
    printed at the end.

    Args:
        areas (Areas): Areas whose residents are processed.
        super_areas (SuperAreas): Super areas of the world; the names of their
            members define the MSOAs tracked during allocation.
        population (Population, optional): Not read by this method. (Default value = None)

    """

    def is_eligible(person):
        # No primary activity assigned yet and age inside the inclusive range
        return (
            person.primary_activity is None
            and self.age_range[0] <= person.age <= self.age_range[1]
        )

    def count_eligible(area):
        return sum(1 for person in area.people if is_eligible(person))

    self.areas = areas
    self.super_areas = super_areas

    # Names of the super areas present in this world, computed once
    self._world_msoas_cache = {member.name for member in self.super_areas.members}

    # Only MSOAs that exist in this world and have capacity data are tracked
    self.allocated_workers_per_msoa = dict.fromkeys(
        (
            msoa
            for msoa in self._world_msoas_cache
            if msoa in self.msoa_employment_capacity
        ),
        0,
    )

    # Pre-computation steps, executed in this fixed order:
    # capacity lookup, per-LAD MSOA probabilities, global MSOA distribution,
    # LAD destination distributions, industry probability cache.
    for build_step in (
        self._build_capacity_lookup,
        self._build_lad_msoa_probabilities_cache,
        self._build_global_msoa_distribution,
        self._build_lad_destination_distributions,
        self._build_industry_probability_cache,
    ):
        build_step()

    # Lockdown status labels, their indices and the per-sector probabilities
    lockdown_tags = np.array(["key_worker", "random", "furlough"])
    lockdown_tags_idx = np.arange(len(lockdown_tags))
    closure_probabilities = self._parse_closure_probabilities_by_sector(
        company_closure=self.company_closure, lockdown_tags=lockdown_tags
    )

    logger.info("Distributing workers to work locations...")

    # Counting pass, used only for progress reporting
    total_areas = len(self.areas)
    eligible_per_area = [count_eligible(area) for area in self.areas]
    areas_with_workers = sum(1 for n_eligible in eligible_per_area if n_eligible > 0)
    total_eligible_workers = sum(eligible_per_area)

    logger.info(f"Found {total_eligible_workers:,} eligible workers across {areas_with_workers:,}/{total_areas:,} areas")

    # NOTE(review): one dict is kept per processed worker although at most 10
    # are sampled for display below.
    worker_samples = []
    processed_areas = 0
    processed_workers = 0

    for i, area in enumerate(self.areas):
        area_workers = count_eligible(area)

        # Progress log on every 100th area
        if i % 100 == 0:
            percent_complete = (i / total_areas) * 100
            worker_percent = (processed_workers / max(total_eligible_workers, 1)) * 100
            logger.info(f"Processing area {i+1:,}/{total_areas:,} ({percent_complete:.1f}%): {area.name} - {area_workers:,} eligible workers | Total processed: {processed_workers:,}/{total_eligible_workers:,} workers ({worker_percent:.1f}%)")

        if area_workers > 0:
            processed_areas += 1

        # The lottery is sized by all residents of the area, not only the eligible ones
        self._lockdown_status_lottery(len(area.people))

        # LAD the area belongs to
        area_lad = self._get_area_lad(area.name)

        for person in area.people:
            if not is_eligible(person):
                continue

            # Work location by LAD likelihood; the original comments state
            # that this call also assigns the sector from industry data
            self._assign_work_location_by_lad(person, area_lad)

            self._assign_lockdown_status(
                closure_probabilities,
                lockdown_tags,
                lockdown_tags_idx,
                person,
            )

            processed_workers += 1

            # Record a row for the sample table
            work_super_area = person.work_super_area
            worker_samples.append({
                "| Person ID": person.id,
                "| Home Area": area.name,
                "| Home LAD": area_lad,
                "| Person Age": person.age,
                "| Assigned Work Super Area": work_super_area.name if work_super_area else "No Assignment",
                "| Assigned Work Sector": getattr(person, 'sector', None),
                "| Work Mode": getattr(person, 'work_mode', None),
                "| Lockdown Status": getattr(person, 'lockdown_status', None),
            })

    logger.info(f"Worker distribution completed: {processed_workers:,}/{total_eligible_workers:,} workers distributed across {processed_areas:,}/{areas_with_workers:,} areas with workers")

    if worker_samples:
        sample_size = min(10, len(worker_samples))
        df_sample = pd.DataFrame(worker_samples).sample(n=sample_size)
        print("\n===== Sample of Workers Distributed Using LAD-based Likelihood =====")
        print(df_sample.to_string(index=False))

    # The aggregated allocation summary (_print_allocation_summary) is not printed here.

    logger.info(f"{len(worker_samples)} workers distributed.")

    # Drop the per-run caches so their memory can be reclaimed
    for cache_attr in (
        "_world_msoas_cache",
        "_capacity_lookup",
        "_global_msoa_selections",
        "_global_msoa_idx",
        "_lad_destination_cache",
        "_industry_probability_cache",
    ):
        setattr(self, cache_attr, None)

`for_super_areas(area_names, config_file=default_config_file, policy_config_file=default_policy_config_file)` `classmethod`

Create WorkerDistributorNew for specific super areas

Parameters:

Name	Type	Description	Default
`area_names`	`List[str]`		required
`config_file`	`str`	(Default value = default_config_file)	`default_config_file`
`policy_config_file`	`str`	(Default value = default_policy_config_file)	`default_policy_config_file`

Source code in june/distributors/worker_distributor_new.py

@classmethod
def for_super_areas(
    cls,
    area_names: List[str],
    config_file: str = default_config_file,
    policy_config_file: str = default_policy_config_file,
) -> "WorkerDistributorNew":
    """Build a WorkerDistributorNew for the given super areas.

    Thin convenience wrapper: all three arguments are forwarded unchanged,
    in order, to ``from_file``.

    Args:
        area_names (List[str]): Super area names passed on to ``from_file``.
        config_file (str, optional): (Default value = default_config_file)
        policy_config_file (str, optional): (Default value = default_policy_config_file)

    Returns:
        WorkerDistributorNew: Whatever ``from_file`` returns.

    """
    forwarded_args = (area_names, config_file, policy_config_file)
    return cls.from_file(*forwarded_args)

`from_file(area_names=None, config_file=default_config_file, policy_config_file=default_policy_config_file)` `classmethod`

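Create WorkerDistributorNew from data files. All regional datasets (Scotland, England/Wales and Northern Ireland) are always loaded so that cross-border work assignments can be resolved; the area codes (E=England, W=Wales, S=Scotland, N=Northern Ireland) are only used to log which regions the world contains.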

Parameters:

Name	Type	Description	Default
`area_names`	`List[str]`	List of SuperArea names for which to initiate WorkerDistributorNew (Default value = None)	`None`
`config_file`	`str`	Configuration file with worker distributor settings (Default value = default_config_file)	`default_config_file`
`policy_config_file`	`str`	Policy configuration file with company closure settings (Default value = default_policy_config_file)	`default_policy_config_file`

Source code in june/distributors/worker_distributor_new.py

@classmethod
def from_file(
    cls,
    area_names: List[str] = None,
    config_file: str = default_config_file,
    policy_config_file: str = default_policy_config_file,
) -> "WorkerDistributorNew":
    """Create WorkerDistributorNew from the default regional data files.

    Data for all regions (Scotland, England/Wales, Northern Ireland) is always
    loaded, because likelihood data may reference work destinations in LADs
    outside the regions present in the world. ``area_names`` is only used to
    log which regions the world contains (first letter of each name:
    E/W = England/Wales, S = Scotland, N = Northern Ireland).

    Northern Ireland data is optional and loaded all-or-nothing: if any of
    its files is missing, none of it is merged, a warning is logged, and
    loading continues with the other regions.

    Args:
        area_names (List[str], optional): List of SuperArea names for which to initiate WorkerDistributorNew (Default value = None)
        config_file (str, optional): Configuration file with worker distributor settings (Default value = default_config_file)
        policy_config_file (str, optional): Policy configuration file with company closure settings (Default value = default_policy_config_file)

    Returns:
        WorkerDistributorNew: Instance built from the concatenated regional data.

    Raises:
        FileNotFoundError: If a Scottish or England/Wales data file, or one of
            the two config files, does not exist.
        KeyError: If a required key is missing from the loaded configuration.

    """
    area_names = area_names or []

    # Frames are collected per dataset and concatenated once at the end,
    # instead of repeatedly concatenating onto an empty placeholder frame.
    likelihood_frames = []
    workers_frames = []
    sex_industry_frames = []
    geography_frames = []

    # IMPORTANT: For cross-border work assignments, we need to load ALL geography data
    # even if our world only contains areas from one region, because likelihood data
    # may reference LADs from other regions as work destinations
    need_scotland = True
    need_england_wales = True
    need_northern_ireland = True

    if area_names:
        # Only informational: report which regions have areas in our world
        area_codes = set(area_names)
        has_scotland_areas = any(area.startswith('S') for area in area_codes)
        has_england_wales_areas = any(area.startswith(('E', 'W')) for area in area_codes)
        has_northern_ireland_areas = any(area.startswith('N') for area in area_codes)

        logger.info(f"World contains: Scotland areas={has_scotland_areas}, England/Wales areas={has_england_wales_areas}, Northern Ireland areas={has_northern_ireland_areas}")
        logger.info("Loading all regional data to support cross-border work assignments")

    logger.info(f"Data loading strategy: Scotland={need_scotland}, England/Wales={need_england_wales}, Northern Ireland={need_northern_ireland}")

    # Load Scottish data if needed
    if need_scotland:
        logger.info("Loading Scottish work data files...")
        likelihood_frames.append(pd.read_csv(default_likelihood_file))
        workers_frames.append(pd.read_csv(default_workers_file))
        sex_industry_frames.append(pd.read_csv(default_sex_industry_file))
        geography_frames.append(pd.read_csv(default_geography_file))

    # Load England/Wales data if needed
    if need_england_wales:
        logger.info("Loading England/Wales work data files...")
        likelihood_frames.append(pd.read_csv(default_ew_likelihood_file))
        workers_frames.append(pd.read_csv(default_ew_workers_file))
        sex_industry_frames.append(pd.read_csv(default_ew_sex_industry_file))
        geography_frames.append(pd.read_csv(default_ew_geography_file, low_memory=False))

    # Load Northern Ireland data if needed
    if need_northern_ireland:
        logger.info("Loading Northern Ireland work data files...")
        try:
            # Read every NI file before merging any of them, so a missing
            # file cannot leave the datasets partially extended with NI rows.
            ni_likelihood_df = pd.read_csv(default_ni_likelihood_file)
            ni_workers_df = pd.read_csv(default_ni_workers_file)
            ni_sex_industry_df = pd.read_csv(default_ni_sex_industry_file)
            ni_geography_df = pd.read_csv(default_ni_geography_file)
        except FileNotFoundError as e:
            logger.warning(f"Northern Ireland work data file not found: {e}")
            logger.info("Continuing without Northern Ireland work data - NI workers will be handled as out-of-scope")
        else:
            likelihood_frames.append(ni_likelihood_df)
            workers_frames.append(ni_workers_df)
            sex_industry_frames.append(ni_sex_industry_df)
            geography_frames.append(ni_geography_df)
            logger.info("Successfully loaded Northern Ireland work data files")

    def _combine(frames):
        # pd.concat raises on an empty list, so fall back to an empty frame
        return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    likelihood_df = _combine(likelihood_frames)
    workers_df = _combine(workers_frames)
    sex_industry_df = _combine(sex_industry_frames)
    geography_df = _combine(geography_frames)

    # Load config files
    with open(config_file, 'r') as f:
        config = yaml.safe_load(f)

    with open(policy_config_file, 'r') as f:
        policy_config = yaml.safe_load(f)

    return cls(
        likelihood_df=likelihood_df,
        geography_df=geography_df,
        workers_df=workers_df,
        sex_industry_df=sex_industry_df,
        company_closure=policy_config["company_closure"]["sectors"],
        age_range=config['age_range'],
        sub_sector_ratio=config['sub_sector_ratio'],
        sub_sector_distr=config['sub_sector_distr'],
        non_geographical_work_location=config['non_geographical_work_location']
    )

`create_worker_distributor_new(config_file=default_config_file, policy_config_file=default_policy_config_file)`

Factory function to create WorkerDistributorNew with automatic regional data detection

Parameters:

Name	Type	Description	Default
`config_file`	`str`	(Default value = default_config_file)	`default_config_file`
`policy_config_file`	`str`	(Default value = default_policy_config_file)	`default_policy_config_file`

Source code in june/distributors/worker_distributor_new.py

def create_worker_distributor_new(
    config_file: str = default_config_file,
    policy_config_file: str = default_policy_config_file,
):
    """Factory function returning a WorkerDistributorNew built via ``from_file``.

    ``from_file`` is called with ``area_names=None``; the two configuration
    paths are forwarded unchanged.

    Args:
        config_file (str, optional): (Default value = default_config_file)
        policy_config_file (str, optional): (Default value = default_policy_config_file)

    Returns:
        The object returned by ``WorkerDistributorNew.from_file``.

    """
    factory_kwargs = {
        "area_names": None,
        "config_file": config_file,
        "policy_config_file": policy_config_file,
    }
    return WorkerDistributorNew.from_file(**factory_kwargs)

Worker distributor new