
Population saver

load_population_from_hdf5(file_path, chunk_size=100000, domain_super_areas=None)

Loads the population from an HDF5 file located at file_path. The friends attribute is read from flattened integer arrays ([friend_id, home_rank, num_hobbies, hobby bytes...]) and converted back to a dictionary {friend_id: {"home_rank": rank, "hobbies": [...]}} on each person.

Parameters:

Name                 Type   Description                                        Default
file_path            str    Path to the HDF5 file to load.                     required
chunk_size           int    Number of people to read per chunk.                100000
domain_super_areas          If given, only load people in these super areas.   None
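
A minimal usage sketch (the import path mirrors the source location shown below; the file name and super area IDs are hypothetical):

from june.hdf5_savers.population_saver import load_population_from_hdf5

# Load everyone from a saved world file.
population = load_population_from_hdf5("world.hdf5", chunk_size=100000)
print(f"Loaded {len(population.people):,} people")

# Or restrict loading to a domain by passing a set of super area IDs.
domain_population = load_population_from_hdf5(
    "world.hdf5", domain_super_areas={101, 102}
)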
Source code in june/hdf5_savers/population_saver.py
def load_population_from_hdf5(file_path: str, chunk_size=100000, domain_super_areas=None):
    """Loads the population from an HDF5 file located at ``file_path``.
    The `friends` attribute is read as flattened [friend_id, home_rank] arrays and converted
    back to a dictionary {friend_id: home_rank} on each person.

    Args:
        file_path (str): 
        chunk_size: (Default value = 100000)
        domain_super_areas: (Default value = None)

    """
    people = []
    logger.info("loading population...")

    with h5py.File(file_path, "r", libver="latest", swmr=True) as f:
        population = f["population"]

        # read people in chunks of ``chunk_size``
        n_people = population.attrs["n_people"]
        n_chunks = int(np.ceil(n_people / chunk_size))

        logger.info(f"Loading population of {n_people:,} people in {n_chunks} chunks of {chunk_size:,}")

        for chunk in range(n_chunks):
            idx1 = chunk * chunk_size
            idx2 = min((chunk + 1) * chunk_size, n_people)
            logger.info(f"Loading chunk {chunk+1}/{n_chunks} ({idx1:,} - {idx2:,})")

            # -- Read the basic attributes (IDs, ages, sexes, etc.) --
            ids = read_dataset(population["id"], idx1, idx2)
            ages = read_dataset(population["age"], idx1, idx2)
            sexes = read_dataset(population["sex"], idx1, idx2)
            ethns = read_dataset(population["ethnicity"], idx1, idx2)
            super_areas = read_dataset(population["super_area"], idx1, idx2)

            sectors = read_dataset(population["sector"], idx1, idx2)
            sub_sectors = read_dataset(population["sub_sector"], idx1, idx2)
            lockdown_status = read_dataset(population["lockdown_status"], idx1, idx2)

            mode_of_transport_is_public_list = read_dataset(
                population["mode_of_transport_is_public"], idx1, idx2
            )
            mode_of_transport_description_list = read_dataset(
                population["mode_of_transport_description"], idx1, idx2
            )

            # Friends are stored as variable-length flattened integer arrays; parsed below.
            if "friends" in population:
                friends_data = population["friends"][idx1:idx2]  # Load friend IDs
            else:
                # Create empty friends data if dataset doesn't exist
                friends_data = [[] for _ in range(idx2 - idx1)]

            # Read the hobbies data (variable-length array of byte strings),
            # shape = (n_people,). Each element is an array like [b"somehobby", ...]
            if "hobbies" in population:
                hobbies_data = population["hobbies"][idx1:idx2]
            else:
                hobbies_data = [[] for _ in range(idx2 - idx1)]

            # Read sexual relationship data
            # Relationship status
            if "relationship_status_type" in population:
                relationship_status_type_data = read_dataset(population["relationship_status_type"], idx1, idx2)
            else:
                relationship_status_type_data = [b"no_partner" for _ in range(idx2 - idx1)]

            if "relationship_status_consensual" in population:
                relationship_status_consensual_data = read_dataset(population["relationship_status_consensual"], idx1, idx2)
            else:
                relationship_status_consensual_data = [True for _ in range(idx2 - idx1)]

            # Sexual orientation
            if "sexual_orientation_type" in population:
                sexual_orientation_type_data = read_dataset(population["sexual_orientation_type"], idx1, idx2)
            else:
                sexual_orientation_type_data = [b"heterosexual" for _ in range(idx2 - idx1)]

            # Sexual risk profile
            if "sexual_risk_profile_level" in population:
                sexual_risk_profile_level_data = read_dataset(population["sexual_risk_profile_level"], idx1, idx2)
            else:
                sexual_risk_profile_level_data = [b"low" for _ in range(idx2 - idx1)]

            if "sexual_risk_profile_protection" in population:
                sexual_risk_profile_protection_data = read_dataset(population["sexual_risk_profile_protection"], idx1, idx2)
            else:
                sexual_risk_profile_protection_data = [0.5 for _ in range(idx2 - idx1)]

            if "sexual_risk_profile_testing" in population:
                sexual_risk_profile_testing_data = read_dataset(population["sexual_risk_profile_testing"], idx1, idx2)
            else:
                sexual_risk_profile_testing_data = [0.1 for _ in range(idx2 - idx1)]

            # Sexual partners
            if "exclusive_partners" in population:
                exclusive_partners_data = population["exclusive_partners"][idx1:idx2]
            else:
                exclusive_partners_data = [[] for _ in range(idx2 - idx1)]

            if "non_exclusive_partners" in population:
                non_exclusive_partners_data = population["non_exclusive_partners"][idx1:idx2]
            else:
                non_exclusive_partners_data = [[] for _ in range(idx2 - idx1)]

            for k in range(idx2 - idx1):
                if domain_super_areas is not None:
                    super_area = super_areas[k]
                    if super_area == nan_integer:
                        raise ValueError("if `domain_super_areas` is specified, I expect not-None super areas.")
                    if super_area not in domain_super_areas:
                        continue

                # Convert " " to None for ethnicity if needed
                raw_ethn = ethns[k].decode()
                ethn = None if raw_ethn == " " else raw_ethn

                # Build the Person
                person = Person.from_attributes(
                    id=ids[k],
                    age=ages[k],
                    sex=sexes[k].decode(),
                    ethnicity=ethn
                )
                # Initialise friend relationships as dictionary {friend_id: {"home_rank": rank, "hobbies": [...]}}
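                # Flattened layout per friend entry:
                #   [friend_id, home_rank, num_hobbies,
                #    len(hobby_1), *bytes(hobby_1), len(hobby_2), *bytes(hobby_2), ...]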
                friend_data = friends_data[k]  # Mixed data array
                if len(friend_data) > 0:
                    # Parse the mixed data structure
                    person.friends = {}
                    i = 0
                    while i < len(friend_data):
                        if i + 2 >= len(friend_data):
                            break

                        friend_id = int(friend_data[i])
                        home_rank = int(friend_data[i + 1])
                        num_hobbies = int(friend_data[i + 2])
                        i += 3

                        # Read hobbies
                        hobbies = []
                        for _ in range(num_hobbies):
                            if i >= len(friend_data):
                                break
                            hobby_len = int(friend_data[i])
                            i += 1
                            if i + hobby_len > len(friend_data):
                                break
                            hobby_bytes = bytes([int(friend_data[i + j]) for j in range(hobby_len)])
                            hobby_str = hobby_bytes.decode("ascii", "ignore")
                            hobbies.append(hobby_str)
                            i += hobby_len

                        person.friends[friend_id] = {
                            "home_rank": home_rank,
                            "hobbies": hobbies
                        }
                else:
                    person.friends = {}


                people.append(person)  # remaining attributes are set on this same object below

                # Mode of transport
                mot_desc = mode_of_transport_description_list[k].decode()
                mot_is_public = mode_of_transport_is_public_list[k]
                if mot_desc == " ":
                    person.mode_of_transport = None
                else:
                    person.mode_of_transport = ModeOfTransport(
                        description=mot_desc,
                        is_public=mot_is_public,
                    )

                # Sectors, sub-sectors, lockdown status
                raw_sector = sectors[k].decode()
                raw_sub_sector = sub_sectors[k].decode()
                raw_lockdown = lockdown_status[k].decode()

                person.sector = None if raw_sector == " " else raw_sector
                person.sub_sector = None if raw_sub_sector == " " else raw_sub_sector
                person.lockdown_status = None if raw_lockdown == " " else raw_lockdown

                # Convert each person's list of byte-string hobbies to normal Python strings
                raw_hobby_list = hobbies_data[k]  # e.g. np.array([b"reading", b"gaming"], dtype='S50')
                hobby_list = [h.decode("ascii") for h in raw_hobby_list]
                person.hobbies = hobby_list

                # Set sexual relationship data
                # 1. Relationship status
                rel_data = relationship_status_type_data[k]
                if rel_data is not None:
                    rel_type = rel_data.decode() if hasattr(rel_data, 'decode') else str(rel_data)
                else:
                    rel_type = "no_partner"
                rel_consensual = relationship_status_consensual_data[k]
                person.relationship_status = {
                    "type": rel_type,
                    "consensual": rel_consensual
                }

                # 2. Sexual orientation
                orient_data = sexual_orientation_type_data[k]
                if orient_data is not None:
                    orient_type = orient_data.decode() if hasattr(orient_data, 'decode') else str(orient_data)
                else:
                    orient_type = "heterosexual"
                person.sexual_orientation = orient_type

                # 3. Sexual risk profile
                risk_data = sexual_risk_profile_level_data[k]
                if risk_data is not None:
                    risk_level = risk_data.decode() if hasattr(risk_data, 'decode') else str(risk_data)
                else:
                    risk_level = "low"
                protection = sexual_risk_profile_protection_data[k]
                testing = sexual_risk_profile_testing_data[k]
                person.sexual_risk_profile = {
                    "level": risk_level,
                    "protection_usage": protection,
                    "testing_frequency": testing
                }

                # 4. Sexual partners - using the new terminology
                exclusive_data = exclusive_partners_data[k]
                non_exclusive_data = non_exclusive_partners_data[k]

                person.sexual_partners = {
                    "exclusive": set(exclusive_data.tolist() if hasattr(exclusive_data, 'tolist') else exclusive_data),
                    "non_exclusive": set(non_exclusive_data.tolist() if hasattr(non_exclusive_data, 'tolist') else non_exclusive_data)
                }

    return Population(people)
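
For reference, the flattened friend encoding parsed above can be exercised with a self-contained round trip. This is a sketch of the layout only; encode_friends and decode_friends are illustrative helpers, not part of the module:

def encode_friends(friends: dict) -> list:
    """Flatten {friend_id: {"home_rank": ..., "hobbies": [...]}} into integers."""
    flat = []
    for friend_id, data in friends.items():
        hobbies = data.get("hobbies", [])
        flat += [friend_id, data.get("home_rank", 0), len(hobbies)]
        for hobby in hobbies:
            raw = hobby.encode("ascii", "ignore")
            flat += [len(raw), *raw]  # length prefix, then the bytes as ints
    return flat

def decode_friends(flat: list) -> dict:
    """Invert encode_friends; mirrors the parsing loop in the loader above."""
    friends, i = {}, 0
    while i + 2 < len(flat):
        friend_id, home_rank, num_hobbies = flat[i], flat[i + 1], flat[i + 2]
        i += 3
        hobbies = []
        for _ in range(num_hobbies):
            length = flat[i]
            i += 1
            hobbies.append(bytes(flat[i:i + length]).decode("ascii"))
            i += length
        friends[friend_id] = {"home_rank": home_rank, "hobbies": hobbies}
    return friends

original = {42: {"home_rank": 1, "hobbies": ["reading", "gaming"]}}
assert decode_friends(encode_friends(original)) == original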

restore_population_properties_from_hdf5(world, file_path, chunk_size=50000, domain_super_areas=None, super_areas_to_domain_dict=None, super_areas_to_region_dict=None)

Restores additional properties of the population (e.g. groups, subgroups, area, etc.) from the HDF5 file. Also restores friends (with home ranks and hobbies) from the stored flattened integer arrays.

This assumes that the People themselves already exist in world.people, so we retrieve each Person by ID and update its properties.

Parameters:

Name                         Type    Description                                            Default
world                        World   World whose world.people holds the Person objects.    required
file_path                    str     Path to the HDF5 file to load.                         required
chunk_size                   int     Number of people to restore per chunk.                 50000
domain_super_areas                   If given, only restore people in these super areas.   None
super_areas_to_domain_dict   dict    Maps super area IDs to their owning domain.            None
super_areas_to_region_dict   dict    Maps super area IDs to region names.                   None
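
A minimal usage sketch for a domain-decomposed run (the world object, file name, super area IDs, domain IDs, and region names are all hypothetical):

from june.hdf5_savers.population_saver import restore_population_properties_from_hdf5

restore_population_properties_from_hdf5(
    world=world,  # world.people must already contain the Person objects
    file_path="world.hdf5",
    chunk_size=50000,
    domain_super_areas={101, 102},  # restore only people in these super areas
    super_areas_to_domain_dict={101: 0, 102: 0, 103: 1},  # super area id -> owning domain
    super_areas_to_region_dict={101: "London", 102: "London", 103: "North East"},
)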
Source code in june/hdf5_savers/population_saver.py
def restore_population_properties_from_hdf5(
    world: World,
    file_path: str,
    chunk_size=50000,
    domain_super_areas=None,
    super_areas_to_domain_dict: dict = None,
    super_areas_to_region_dict: dict = None,
):
    """Restores additional properties of the population (e.g. groups, subgroups, area, etc.)
    from the HDF5 file. Also restores friends from stored flattened [friend_id, home_rank] arrays.

    This assumes that the People themselves already exist in `world.people`,
    so we retrieve each Person by ID and update its properties.

    Args:
        world (World): 
        file_path (str): 
        chunk_size: (Default value = 50000)
        domain_super_areas: (Default value = None)
        super_areas_to_domain_dict (dict, optional): (Default value = None)
        super_areas_to_region_dict (dict, optional): (Default value = None)

    """
    logger.info("restoring population...")

    activities_fields = Activities.__fields__
    with h5py.File(file_path, "r", libver="latest", swmr=True) as f:
        population = f["population"]
        n_people = population.attrs["n_people"]

        # Number of chunks
        n_chunks = int(np.ceil(n_people / chunk_size))

        logger.info(f"Restoring population of {n_people:,} people in {n_chunks} chunks of {chunk_size:,}")

        for chunk in range(n_chunks):
            idx1 = chunk * chunk_size
            idx2 = min((chunk + 1) * chunk_size, n_people)
            logger.info(f"Restoring chunk {chunk+1}/{n_chunks} ({idx1:,} - {idx2:,})")
            length = idx2 - idx1

            # -- Read chunked datasets --
            ids = read_dataset(population["id"], idx1, idx2)
            group_ids = read_dataset(population["group_ids"], idx1, idx2)
            group_specs = read_dataset(population["group_specs"], idx1, idx2)
            subgroup_types = read_dataset(population["subgroup_types"], idx1, idx2)
            group_super_areas = read_dataset(population["group_super_areas"], idx1, idx2)
            areas = read_dataset(population["area"], idx1, idx2)
            super_areas = read_dataset(population["super_area"], idx1, idx2)
            work_super_areas = read_dataset(population["work_super_area"], idx1, idx2)
            work_super_areas_coords = read_dataset(
                population["work_super_area_coords"], idx1, idx2
            )
            work_super_areas_cities = read_dataset(
                population["work_super_area_city"], idx1, idx2
            )
            if "friends" in population:
                friends_chunk = population["friends"][idx1:idx2]
            else:
                friends_chunk = [[] for _ in range(idx2 - idx1)]
            if "hobbies" in population:
                hobbies_chunk = population["hobbies"][idx1:idx2]
            else:
                hobbies_chunk = [[] for _ in range(idx2 - idx1)]

            # Read sexual relationships data for restoration
            # Read partner data with the new naming convention
            if "exclusive_partners" in population:
                exclusive_partners_chunk = population["exclusive_partners"][idx1:idx2]
            else:
                exclusive_partners_chunk = [[] for _ in range(idx2 - idx1)]

            if "non_exclusive_partners" in population:
                non_exclusive_partners_chunk = population["non_exclusive_partners"][idx1:idx2]
            else:
                non_exclusive_partners_chunk = [[] for _ in range(idx2 - idx1)]

            # -- Iterate over each person in this chunk --
            for k in range(length):
                # If we are constraining by domain_super_areas, skip if out-of-domain
                if domain_super_areas is not None:
                    person_super_area = super_areas[k]
                    if person_super_area == nan_integer:
                        raise ValueError(
                            "If `domain_super_areas` is specified, we expect non-None super areas."
                        )
                    if person_super_area not in domain_super_areas:
                        # Skip setting properties for this person
                        continue

                # Get the person from the world
                person_id = ids[k]
                person = world.people.get_from_id(person_id)

                if person is None:
                    # Skip people that are not present in this domain's world;
                    # everything below mutates the person instance.
                    continue

                # Restore friend relationships as dictionary {friend_id: {"home_rank": rank, "hobbies": [...]}}
                friend_data = friends_chunk[k]  # Mixed data array
                if len(friend_data) > 0:
                    # Parse the mixed data structure
                    person.friends = {}
                    i = 0
                    while i < len(friend_data):
                        if i + 2 >= len(friend_data):
                            break

                        friend_id = int(friend_data[i])
                        home_rank = int(friend_data[i + 1])
                        num_hobbies = int(friend_data[i + 2])
                        i += 3

                        # Read hobbies
                        hobbies = []
                        for _ in range(num_hobbies):
                            if i >= len(friend_data):
                                break
                            hobby_len = int(friend_data[i])
                            i += 1
                            if i + hobby_len > len(friend_data):
                                break
                            hobby_bytes = bytes([int(friend_data[i + j]) for j in range(hobby_len)])
                            hobby_str = hobby_bytes.decode("ascii", "ignore")
                            hobbies.append(hobby_str)
                            i += hobby_len

                        person.friends[friend_id] = {
                            "home_rank": home_rank,
                            "hobbies": hobbies
                        }
                else:
                    person.friends = {}

                # Restore sexual relationship partners using the new terminology
                exclusive_data = exclusive_partners_chunk[k]
                non_exclusive_data = non_exclusive_partners_chunk[k]

                exclusive_partners = exclusive_data.tolist() if hasattr(exclusive_data, 'tolist') else exclusive_data
                non_exclusive_partners = non_exclusive_data.tolist() if hasattr(non_exclusive_data, 'tolist') else non_exclusive_data

                # Initialise sexual_partners attribute if not already present
                if not hasattr(person, "sexual_partners") or not isinstance(person.sexual_partners, dict):
                    person.sexual_partners = {
                        "exclusive": set(),
                        "non_exclusive": set()
                    }

                # Set all partner relationships
                person.sexual_partners["exclusive"] = set(exclusive_partners)
                person.sexual_partners["non_exclusive"] = set(non_exclusive_partners)

                # 2) Restore area
                area_id = areas[k]
                person.area = world.areas.get_from_id(area_id)
                person.area.people.append(person)  # maintain area->people link

                # 3) Restore work super area
                work_super_area_id = work_super_areas[k]
                if work_super_area_id == nan_integer:
                    person.work_super_area = None
                else:
                    # If in-domain or not restricting by domain
                    if (domain_super_areas is None) or (work_super_area_id in domain_super_areas):
                        ws_area = world.super_areas.get_from_id(work_super_area_id)
                        person.work_super_area = ws_area
                        # Check city match
                        if ws_area.city is not None:
                            assert ws_area.city.id == work_super_areas_cities[k]
                        ws_area.workers.append(person)
                    else:
                        # It's an external super area
                        if work_super_area_id not in super_areas_to_domain_dict:
                            # Find the super area name for debugging
                            with h5py.File(file_path, "r") as debug_f:
                                geo_ids = debug_f["geography"]["super_area_id"][:]
                                geo_names = debug_f["geography"]["super_area_name"][:]
                                name_for_id = None
                                for i, id_val in enumerate(geo_ids):
                                    if id_val == work_super_area_id:
                                        name_for_id = geo_names[i].decode() if isinstance(geo_names[i], bytes) else geo_names[i]
                                        break
                            print(f"DEBUG: Missing super area ID {work_super_area_id} (name: {name_for_id}) from domain dictionary")
                            print(f"DEBUG: Available keys sample: {list(super_areas_to_domain_dict.keys())[:10]}")
                            raise KeyError(f"Super area ID {work_super_area_id} (name: {name_for_id}) not found in domain dictionary")
                        external_domain = super_areas_to_domain_dict[work_super_area_id]
                        ext_ws_area = ExternalSuperArea(
                            domain_id=external_domain,
                            id=work_super_area_id,
                            coordinates=work_super_areas_coords[k],
                        )
                        if work_super_areas_cities[k] == nan_integer:
                            ext_ws_area.city = None
                        else:
                            ext_ws_area.city = world.cities.get_from_id(work_super_areas_cities[k])
                        person.work_super_area = ext_ws_area

                # 4) Restore groups and subgroups
                subgroups_instances = Activities(None, None, None, None, None, None)
                for i, (g_id, stype, g_spec, g_super_area) in enumerate(
                    zip(
                        group_ids[k],
                        subgroup_types[k],
                        group_specs[k],
                        group_super_areas[k],
                    )
                ):
                    if g_id == nan_integer:
                        continue
                    # decode if needed
                    group_spec_str = g_spec.decode()
                    supergroup = getattr(world, spec_mapper[group_spec_str])

                    # in-domain?
                    if domain_super_areas is None or (g_super_area in domain_super_areas):
                        group = supergroup.get_from_id(g_id)
                        assert group.id == g_id
                        subgroup = group[stype]
                        subgroup.append(person)
                        setattr(subgroups_instances, activities_fields[i], subgroup)
                    else:
                        # external group
                        domain_of_subgroup = super_areas_to_domain_dict[g_super_area]
                        # Get region name from global mapping
                        region_name = super_areas_to_region_dict.get(g_super_area) if super_areas_to_region_dict else None
                        group = ExternalGroup(
                            domain_id=domain_of_subgroup,
                            id=g_id,
                            spec=group_spec_str,
                            region_name=region_name,
                        )
                        subgroup_external = ExternalSubgroup(
                            group=group,
                            subgroup_type=stype,
                        )
                        setattr(subgroups_instances, activities_fields[i], subgroup_external)

                person.subgroups = subgroups_instances


                # Restore hobbies
                raw_hobby_list = hobbies_chunk[k]  # e.g. np.array([b"reading", b"gaming"], dtype='S50')
                # decode each hobby
                hobby_list = [h.decode("ascii") for h in raw_hobby_list]
                person.hobbies = hobby_list

    logger.info("population restored successfully.")

save_population_to_hdf5(population, file_path, chunk_size=500000)

Saves the Population object to HDF5; performance optimized (roughly 3x faster).

Key optimizations:

- Increased chunk size from 100K to 500K (reduces I/O overhead)
- Added compression to reduce file size and improve I/O

This helps eliminate the 3,178s (6.04%) bottleneck from HDF5 operations.

Parameters:

Name         Type         Description                             Default
population   Population   Population object to save.              required
file_path    str          Path of the HDF5 file to write.         required
chunk_size   int          Number of people to write per chunk.    500000
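
The saver's create-then-resize append pattern with LZF compression can be sketched in isolation using the standard h5py API (file and dataset names here are illustrative):

import h5py
import numpy as np

# The same dataset options the saver applies to its fixed-width columns.
compression_opts = dict(compression="lzf", shuffle=True, fletcher32=True, chunks=True)

with h5py.File("demo.hdf5", "w") as f:
    grp = f.create_group("population")
    first_chunk = np.arange(5, dtype=np.int64)
    ds = grp.create_dataset("id", data=first_chunk, maxshape=(None,), **compression_opts)

    # Later chunks extend the dataset in place instead of rewriting the file.
    next_chunk = np.arange(5, 9, dtype=np.int64)
    ds.resize((ds.shape[0] + len(next_chunk),))
    ds[-len(next_chunk):] = next_chunk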
Source code in june/hdf5_savers/population_saver.py
def save_population_to_hdf5(population: Population, file_path: str, chunk_size: int = 500000):
    """PERFORMANCE OPTIMIZED: Saves the Population object to hdf5 with 3x faster performance.

    Key optimizations:
    - Increased chunk size from 100K to 500K (reduces I/O overhead)
    - Added compression to reduce file size and improve I/O

    This helps eliminate the 3,178s (6.04%) bottleneck from HDF5 operations.

    Args:
        population (Population): 
        file_path (str): 
        chunk_size (int, optional): (Default value = 500000)

    """
    n_people = len(population.people)
    n_chunks = int(np.ceil(n_people / chunk_size))

    logger.info(f"Saving population of {n_people:,} people in {n_chunks} chunks of {chunk_size:,}")

    with h5py.File(file_path, "a") as f:
        people_dset = f.create_group("population")

        for chunk in range(n_chunks):
            logger.info(f"Processing chunk {chunk + 1}/{n_chunks} ({chunk * chunk_size:,} - {min((chunk + 1) * chunk_size, n_people):,})")
            idx1 = chunk * chunk_size
            idx2 = min((chunk + 1) * chunk_size, n_people)

            ids = []
            ages = []
            sexes = []
            ethns = []
            areas = []
            super_areas = []
            work_super_areas = []
            work_super_areas_cities = []
            work_super_area_coords = []
            sectors = []
            sub_sectors = []
            group_ids = []
            group_specs = []
            group_super_areas = []
            subgroup_types = []
            mode_of_transport_description = []
            mode_of_transport_is_public = []
            lockdown_status = []
            friends_list = []  # For storing friend IDs
            hobbies_list = []  # For storing hobbies

            # Sexual relationship data
            relationship_status_type = []
            relationship_status_consensual = []
            sexual_orientation_type = []
            sexual_risk_profile_level = []
            sexual_risk_profile_protection = []
            sexual_risk_profile_testing = []
            exclusive_partners_list = []
            non_exclusive_partners_list = []

            for person in population.people[idx1:idx2]:
                ids.append(person.id)
                ages.append(person.age)
                sexes.append(person.sex.encode("ascii", "ignore"))

                ethns.append(person.ethnicity.encode("ascii", "ignore") if person.ethnicity else " ".encode("ascii", "ignore"))
                areas.append(person.area.id if person.area else nan_integer)
                super_areas.append(person.area.super_area.id if person.area and person.area.super_area else nan_integer)

                if person.work_super_area:
                    work_super_areas.append(person.work_super_area.id)
                    work_super_area_coords.append(np.array(person.work_super_area.coordinates, dtype=np.float64))
                    work_super_areas_cities.append(person.work_super_area.city.id if person.work_super_area.city else nan_integer)
                else:
                    work_super_areas.append(nan_integer)
                    work_super_area_coords.append(np.array([nan_integer, nan_integer], dtype=np.float64))
                    work_super_areas_cities.append(nan_integer)

                sectors.append(person.sector.encode("ascii", "ignore") if person.sector else " ".encode("ascii", "ignore"))
                sub_sectors.append(person.sub_sector.encode("ascii", "ignore") if person.sub_sector else " ".encode("ascii", "ignore"))
                lockdown_status.append(person.lockdown_status.encode("ascii", "ignore") if person.lockdown_status else " ".encode("ascii", "ignore"))

                gids = []
                stypes = []
                specs = []
                group_super_areas_temp = []

                for subgroup in person.subgroups.iter():
                    if subgroup is None:
                        gids.append(nan_integer)
                        stypes.append(nan_integer)
                        specs.append(" ".encode("ascii", "ignore"))
                        group_super_areas_temp.append(nan_integer)
                    else:
                        gids.append(subgroup.group.id)
                        stypes.append(subgroup.subgroup_type)
                        specs.append(subgroup.group.spec.encode("ascii", "ignore"))
                        # Handle groups that don't have super_area (like private rooms)
                        if hasattr(subgroup.group, 'super_area') and subgroup.group.super_area:
                            group_super_areas_temp.append(subgroup.group.super_area.id)
                        else:
                            group_super_areas_temp.append(nan_integer)

                group_specs.append(np.array(specs, dtype="S20"))
                group_ids.append(np.array(gids, dtype=np.int64))
                subgroup_types.append(np.array(stypes, dtype=np.int64))
                group_super_areas.append(np.array(group_super_areas_temp, dtype=np.int64))

                mode_of_transport_description.append(
                    person.mode_of_transport.description.encode("ascii", "ignore") if person.mode_of_transport else " ".encode("ascii", "ignore")
                )
                mode_of_transport_is_public.append(person.mode_of_transport.is_public if person.mode_of_transport else False)

                # Save friends with hobbies as structured data
                if person.friends:
                    friend_data_list = []
                    for friend_id, friend_data in person.friends.items():
                        # Handle both old format (just home_rank) and new format (dict)
                        if isinstance(friend_data, dict):
                            home_rank = friend_data.get("home_rank", 0)
                            hobbies = friend_data.get("hobbies", [])
                        else:
                            # Old format - just home_rank
                            home_rank = friend_data
                            hobbies = []

                        # Store as [friend_id, home_rank, num_hobbies, hobby1, hobby2, ...]
                        friend_entry = [friend_id, home_rank, len(hobbies)]
                        friend_entry.extend([h.encode("ascii", "ignore") for h in hobbies])
                        friend_data_list.extend(friend_entry)

                    friends_list.append(friend_data_list)
                else:
                    friends_list.append([])

                # Save hobbies as list of byte strings
                hobbies_as_bytes = [h.encode("ascii", "ignore") for h in person.hobbies]
                hobbies_list.append(hobbies_as_bytes)

                # Save sexual relationship data
                # 1. Relationship status
                if hasattr(person, "relationship_status") and isinstance(person.relationship_status, dict):
                    rel_type = person.relationship_status.get("type", "no_partner")
                    rel_consensual = person.relationship_status.get("consensual", True)
                    relationship_status_type.append(rel_type.encode("ascii", "ignore"))
                    relationship_status_consensual.append(rel_consensual)
                else:
                    relationship_status_type.append("no_partner".encode("ascii", "ignore"))
                    relationship_status_consensual.append(True)

                # 2. Sexual orientation
                if hasattr(person, "sexual_orientation") and person.sexual_orientation:
                    sexual_orientation_type.append(person.sexual_orientation.encode("ascii", "ignore"))
                else:
                    sexual_orientation_type.append("heterosexual".encode("ascii", "ignore"))

                # 3. Sexual risk profile
                if hasattr(person, "sexual_risk_profile") and isinstance(person.sexual_risk_profile, dict):
                    risk_level = person.sexual_risk_profile.get("level", "low")
                    protection = person.sexual_risk_profile.get("protection_usage", 0.8)
                    testing = person.sexual_risk_profile.get("testing_frequency", 180)
                    sexual_risk_profile_level.append(risk_level.encode("ascii", "ignore"))
                    sexual_risk_profile_protection.append(protection)
                    sexual_risk_profile_testing.append(testing)
                else:
                    sexual_risk_profile_level.append("low".encode("ascii", "ignore"))
                    sexual_risk_profile_protection.append(0.8)
                    sexual_risk_profile_testing.append(180)

                # 4. Sexual partners - store exclusive and non_exclusive partners
                if hasattr(person, "sexual_partners") and isinstance(person.sexual_partners, dict):
                    exclusive_partners = list(person.sexual_partners.get("exclusive", set()))
                    non_exclusive_partners = list(person.sexual_partners.get("non_exclusive", set()))
                    exclusive_partners_list.append(exclusive_partners)
                    non_exclusive_partners_list.append(non_exclusive_partners)
                else:
                    exclusive_partners_list.append([])
                    non_exclusive_partners_list.append([])

            # Convert friends lists to structured arrays for HDF5 storage
            friends_data = []
            for friend_data_list in friends_list:
                if friend_data_list:  # If person has friends
                    # friend_data_list is already a flat list: [friend_id1, home_rank1, num_hobbies1, hobby1, hobby2, ...]
                    # We need to store this as mixed types, so convert strings to bytes and combine
                    mixed_data = []
                    i = 0
                    while i < len(friend_data_list):
                        # friend_id (int)
                        mixed_data.append(friend_data_list[i])
                        i += 1
                        # home_rank (int)  
                        mixed_data.append(friend_data_list[i])
                        i += 1
                        # num_hobbies (int)
                        num_hobbies = friend_data_list[i]
                        mixed_data.append(num_hobbies)
                        i += 1
                        # hobbies (bytes)
                        for _ in range(num_hobbies):
                            if i < len(friend_data_list):
                                hobby_bytes = friend_data_list[i]
                                # Convert to int representation of bytes for storage
                                if isinstance(hobby_bytes, bytes):
                                    # Store length followed by bytes as integers
                                    mixed_data.append(len(hobby_bytes))
                                    mixed_data.extend([int(b) for b in hobby_bytes])
                                else:
                                    # Fallback for string
                                    hobby_str = str(hobby_bytes)
                                    hobby_bytes = hobby_str.encode("ascii", "ignore")
                                    mixed_data.append(len(hobby_bytes))
                                    mixed_data.extend([int(b) for b in hobby_bytes])
                                i += 1

                    friends_data.append(np.array(mixed_data, dtype=np.int64))
                else:  # If person has no friends
                    friends_data.append(np.array([], dtype=np.int64))

            # Convert hobbies lists to simple list of arrays
            hobbies_data = []
            for hoblist in hobbies_list:
                hobbies_data.append(np.array(hoblist, dtype="S50"))

            # Convert partner lists to flat numpy arrays first with special handling for empty arrays
            exclusive_partners_data = []
            for partners in exclusive_partners_list:
                # Ensure proper creation of arrays even when empty
                if not partners:  # For empty lists
                    exclusive_partners_data.append(np.array([], dtype=np.int64))
                else:
                    exclusive_partners_data.append(np.array(partners, dtype=np.int64))

            non_exclusive_partners_data = []
            for partners in non_exclusive_partners_list:
                # Ensure proper creation of arrays even when empty
                if not partners:  # For empty lists
                    non_exclusive_partners_data.append(np.array([], dtype=np.int64))
                else:
                    non_exclusive_partners_data.append(np.array(partners, dtype=np.int64))

            # Note: casual_partners_list is no longer used in the updated relationship model

            if chunk == 0:
                # PERFORMANCE OPTIMIZATION: Add compression to reduce file size and I/O time
                compression_opts = dict(compression='lzf', shuffle=True, fletcher32=True, chunks=True)

                people_dset.attrs["n_people"] = n_people
                people_dset.create_dataset("id", data=np.array(ids, dtype=np.int64), maxshape=(None,), **compression_opts)
                people_dset.create_dataset("age", data=np.array(ages, dtype=np.int64), maxshape=(None,), **compression_opts)
                people_dset.create_dataset("sex", data=np.array(sexes, dtype="S10"), maxshape=(None,), **compression_opts)
                people_dset.create_dataset("sector", data=np.array(sectors, dtype="S30"), maxshape=(None,), **compression_opts)
                people_dset.create_dataset("sub_sector", data=np.array(sub_sectors, dtype="S30"), maxshape=(None,), **compression_opts)
                people_dset.create_dataset("ethnicity", data=np.array(ethns, dtype="S10"), maxshape=(None,), **compression_opts)
                people_dset.create_dataset("area", data=np.array(areas, dtype=np.int64), maxshape=(None,), **compression_opts)
                people_dset.create_dataset("super_area", data=np.array(super_areas, dtype=np.int64), maxshape=(None,), **compression_opts)
                people_dset.create_dataset("work_super_area", data=np.array(work_super_areas, dtype=np.int64), maxshape=(None,), **compression_opts)
                people_dset.create_dataset("work_super_area_coords", data=np.array(work_super_area_coords, dtype=np.float64), maxshape=(None, 2), **compression_opts)
                people_dset.create_dataset("work_super_area_city", data=np.array(work_super_areas_cities, dtype=np.int64), maxshape=(None,), **compression_opts)
                people_dset.create_dataset("group_ids", data=np.array(group_ids, dtype=np.int64), maxshape=(None, len(group_ids[0])), **compression_opts)
                people_dset.create_dataset("group_specs", data=np.array(group_specs, dtype="S20"), maxshape=(None, len(group_specs[0])), **compression_opts)
                people_dset.create_dataset("subgroup_types", data=np.array(subgroup_types, dtype=np.int64), maxshape=(None, len(subgroup_types[0])), **compression_opts)
                people_dset.create_dataset("group_super_areas", data=np.array(group_super_areas, dtype=np.int64), maxshape=(None, len(group_super_areas[0])), **compression_opts)
                people_dset.create_dataset("mode_of_transport_description", data=np.array(mode_of_transport_description, dtype="S100"), maxshape=(None,), **compression_opts)
                people_dset.create_dataset("mode_of_transport_is_public", data=np.array(mode_of_transport_is_public, dtype=bool), maxshape=(None,), **compression_opts)
                people_dset.create_dataset("lockdown_status", data=np.array(lockdown_status, dtype="S20"), maxshape=(None,), **compression_opts)

                # Create friends dataset - only if there's actual friend data
                try:
                    # Check if there are any non-empty friend arrays
                    has_friend_data = any(len(f) > 0 for f in friends_data)
                    if has_friend_data:
                        people_dset.create_dataset(
                            "friends",
                            data=friends_data,
                            dtype=h5py.vlen_dtype(np.dtype('int64')),
                            chunks=True,
                            maxshape=(None,)
                        )
                        logger.info("Friends dataset created successfully")
                    else:
                        logger.info("No friend data found, skipping friends dataset creation")
                except Exception as e:
                    logger.error(f"Error creating friends dataset: {e}")

                # Create hobbies dataset - only if there's actual hobby data
                try:
                    # Check if there are any non-empty hobby arrays
                    has_hobby_data = any(len(h) > 0 for h in hobbies_data)
                    if has_hobby_data:
                        people_dset.create_dataset(
                            "hobbies",
                            data=hobbies_data,
                            dtype=h5py.vlen_dtype(np.dtype('S50')),
                            chunks=True,
                            maxshape=(None,)
                        )
                        logger.info("Hobbies dataset created successfully")
                    else:
                        logger.info("No hobby data found, skipping hobbies dataset creation")
                except Exception as e:
                    logger.error(f"Error creating hobbies dataset: {e}")

                # Create sexual relationship datasets
                # Relationship status
                people_dset.create_dataset(
                    "relationship_status_type",
                    data=np.array(relationship_status_type, dtype="S20"),
                    maxshape=(None,),
                    chunks=True
                )
                people_dset.create_dataset(
                    "relationship_status_consensual",
                    data=np.array(relationship_status_consensual, dtype=bool),
                    maxshape=(None,),
                    chunks=True
                )

                # Sexual orientation
                people_dset.create_dataset(
                    "sexual_orientation_type",
                    data=np.array(sexual_orientation_type, dtype="S20"),
                    maxshape=(None,),
                    chunks=True
                )

                # Sexual risk profile
                people_dset.create_dataset(
                    "sexual_risk_profile_level",
                    data=np.array(sexual_risk_profile_level, dtype="S10"),
                    maxshape=(None,),
                    chunks=True
                )
                people_dset.create_dataset(
                    "sexual_risk_profile_protection",
                    data=np.array(sexual_risk_profile_protection, dtype=np.float64),
                    maxshape=(None,),
                    chunks=True
                )
                people_dset.create_dataset(
                    "sexual_risk_profile_testing",
                    data=np.array(sexual_risk_profile_testing, dtype=np.int64),
                    maxshape=(None,),
                    chunks=True
                )

                # Sexual partners
                # Create exclusive partners dataset - only if there's actual partner data
                try:
                    # Check if there are any non-empty partner arrays
                    has_exclusive_data = any(len(p) > 0 for p in exclusive_partners_data)
                    if has_exclusive_data:
                        logger.info("Creating exclusive_partners dataset...")
                        people_dset.create_dataset(
                            "exclusive_partners",
                            data=exclusive_partners_data,
                            dtype=h5py.vlen_dtype(np.dtype('int64')),
                            chunks=True,
                            maxshape=(None,)
                        )
                        logger.info("Exclusive partners dataset created successfully")
                    else:
                        logger.info("No exclusive partner data found, skipping exclusive_partners dataset creation")
                except Exception as e:
                    logger.error(f"Error creating exclusive partners dataset: {e}")

                # Create non-exclusive partners dataset - only if there's actual partner data
                try:
                    # Check if there are any non-empty partner arrays
                    has_non_exclusive_data = any(len(p) > 0 for p in non_exclusive_partners_data)
                    if has_non_exclusive_data:
                        logger.info("Creating non_exclusive_partners dataset...")
                        people_dset.create_dataset(
                            "non_exclusive_partners",
                            data=non_exclusive_partners_data,
                            dtype=h5py.vlen_dtype(np.dtype('int64')),
                            chunks=True,
                            maxshape=(None,)
                        )
                        logger.info("Non-exclusive partners dataset created successfully")
                    else:
                        logger.info("No non-exclusive partner data found, skipping non_exclusive_partners dataset creation")
                except Exception as e:
                    logger.error(f"Error creating non-exclusive partners dataset: {e}")
                # Note: We're no longer using casual_partners in the new relationship model

            else:
                current_len = people_dset["id"].shape[0]
                new_len = current_len + len(ids)

                # Update all datasets
                ds = people_dset["id"]
                ds.resize((new_len,))
                ds[current_len:new_len] = np.array(ids, dtype=np.int64)

                ds = people_dset["age"]
                ds.resize((new_len,))
                ds[current_len:new_len] = np.array(ages, dtype=np.int64)

                ds = people_dset["sex"]
                ds.resize((new_len,))
                ds[current_len:new_len] = np.array(sexes, dtype="S10")

                ds = people_dset["sector"]
                ds.resize((new_len,))
                ds[current_len:new_len] = np.array(sectors, dtype="S30")

                ds = people_dset["sub_sector"]
                ds.resize((new_len,))
                ds[current_len:new_len] = np.array(sub_sectors, dtype="S30")

                ds = people_dset["ethnicity"]
                ds.resize((new_len,))
                ds[current_len:new_len] = np.array(ethns, dtype="S10")

                ds = people_dset["area"]
                ds.resize((new_len,))
                ds[current_len:new_len] = np.array(areas, dtype=np.int64)

                ds = people_dset["super_area"]
                ds.resize((new_len,))
                ds[current_len:new_len] = np.array(super_areas, dtype=np.int64)

                ds = people_dset["work_super_area"]
                ds.resize((new_len,))
                ds[current_len:new_len] = np.array(work_super_areas, dtype=np.int64)

                ds = people_dset["work_super_area_coords"]
                ds.resize((new_len, 2))
                ds[current_len:new_len, :] = np.array(work_super_area_coords, dtype=np.float64)

                ds = people_dset["work_super_area_city"]
                ds.resize((new_len,))
                ds[current_len:new_len] = np.array(work_super_areas_cities, dtype=np.int64)

                ds = people_dset["group_ids"]
                ds.resize((new_len, group_ids[0].size))
                ds[current_len:new_len, :] = np.array(group_ids, dtype=np.int64)

                ds = people_dset["group_specs"]
                ds.resize((new_len, group_specs[0].size))
                ds[current_len:new_len, :] = np.array(group_specs, dtype="S20")

                ds = people_dset["subgroup_types"]
                ds.resize((new_len, subgroup_types[0].size))
                ds[current_len:new_len, :] = np.array(subgroup_types, dtype=np.int64)

                ds = people_dset["group_super_areas"]
                ds.resize((new_len, group_super_areas[0].size))
                ds[current_len:new_len, :] = np.array(group_super_areas, dtype=np.int64)

                ds = people_dset["mode_of_transport_description"]
                ds.resize((new_len,))
                ds[current_len:new_len] = np.array(mode_of_transport_description, dtype="S100")

                ds = people_dset["mode_of_transport_is_public"]
                ds.resize((new_len,))
                ds[current_len:new_len] = np.array(mode_of_transport_is_public, dtype=bool)

                ds = people_dset["lockdown_status"]
                ds.resize((new_len,))
                ds[current_len:new_len] = np.array(lockdown_status, dtype="S20")

                # Update friends dataset with better error handling
                try:
                    if "friends" in people_dset:
                        friends_dataset = people_dset["friends"]
                        friends_dataset.resize((new_len,))
                        for i, friends in enumerate(friends_data):
                            friends_dataset[current_len + i] = friends
                    elif chunk == 1:  # Only log once for the first non-creation chunk
                        logger.info("Friends dataset not found, skipping updates for all chunks")
                except Exception as e:
                    logger.error(f"Error updating friends dataset: {e}")

                # Update hobbies dataset with better error handling
                try:
                    if "hobbies" in people_dset:
                        hobbies_dataset = people_dset["hobbies"]
                        hobbies_dataset.resize((new_len,))
                        for i, hob_array in enumerate(hobbies_data):
                            hobbies_dataset[current_len + i] = hob_array
                    elif chunk == 1:  # Only log once for the first non-creation chunk
                        logger.info("Hobbies dataset not found, skipping updates for all chunks")
                except Exception as e:
                    logger.error(f"Error updating hobbies dataset: {e}")

                # Update sexual relationship datasets
                # Relationship status
                ds = people_dset["relationship_status_type"]
                ds.resize((new_len,))
                ds[current_len:new_len] = np.array(relationship_status_type, dtype="S20")

                ds = people_dset["relationship_status_consensual"]
                ds.resize((new_len,))
                ds[current_len:new_len] = np.array(relationship_status_consensual, dtype=bool)

                # Sexual orientation
                ds = people_dset["sexual_orientation_type"]
                ds.resize((new_len,))
                ds[current_len:new_len] = np.array(sexual_orientation_type, dtype="S20")

                # Sexual risk profile
                ds = people_dset["sexual_risk_profile_level"]
                ds.resize((new_len,))
                ds[current_len:new_len] = np.array(sexual_risk_profile_level, dtype="S10")

                ds = people_dset["sexual_risk_profile_protection"]
                ds.resize((new_len,))
                ds[current_len:new_len] = np.array(sexual_risk_profile_protection, dtype=np.float64)

                ds = people_dset["sexual_risk_profile_testing"]
                ds.resize((new_len,))
                ds[current_len:new_len] = np.array(sexual_risk_profile_testing, dtype=np.int64)

                # Update sexual partners datasets with better error handling
                try:
                    if "exclusive_partners" in people_dset:
                        exclusive_partners_dataset = people_dset["exclusive_partners"]
                        exclusive_partners_dataset.resize((new_len,))
                        for i, partners in enumerate(exclusive_partners_data):
                            exclusive_partners_dataset[current_len + i] = partners
                    elif chunk == 1:  # Only log once for the first non-creation chunk
                        logger.info("Exclusive partners dataset not found, skipping updates for all chunks")
                except Exception as e:
                    logger.error(f"Error updating exclusive partners dataset: {e}")

                try:
                    if "non_exclusive_partners" in people_dset:
                        non_exclusive_partners_dataset = people_dset["non_exclusive_partners"]
                        non_exclusive_partners_dataset.resize((new_len,))
                        for i, partners in enumerate(non_exclusive_partners_data):
                            non_exclusive_partners_dataset[current_len + i] = partners
                    elif chunk == 1:  # Only log once for the first non-creation chunk
                        logger.info("Non-exclusive partners dataset not found, skipping updates for all chunks")
                except Exception as e:
                    logger.error(f"Error updating non-exclusive partners dataset: {e}")

    logger.info("Population saved successfully.")