diff --git a/.benchmarks/Linux-CPython-3.11-64bit/0001_pr_vectorized.json b/.benchmarks/Linux-CPython-3.11-64bit/0001_pr_vectorized.json deleted file mode 100644 index 9963567d6..000000000 --- a/.benchmarks/Linux-CPython-3.11-64bit/0001_pr_vectorized.json +++ /dev/null @@ -1,725 +0,0 @@ -{ - "machine_info": { - "node": "msika", - "processor": "x86_64", - "machine": "x86_64", - "python_compiler": "Clang 18.1.8 ", - "python_implementation": "CPython", - "python_implementation_version": "3.11.11", - "python_version": "3.11.11", - "python_build": [ - "main", - "Dec 6 2024 20:02:44" - ], - "release": "6.8.0-100-generic", - "system": "Linux", - "cpu": { - "python_version": "3.11.11.final.0 (64 bit)", - "cpuinfo_version": [ - 9, - 0, - 0 - ], - "cpuinfo_version_string": "9.0.0", - "arch": "X86_64", - "bits": 64, - "count": 8, - "arch_string_raw": "x86_64", - "vendor_id_raw": "GenuineIntel", - "brand_raw": "11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz", - "hz_advertised_friendly": "3.0000 GHz", - "hz_actual_friendly": "3.2761 GHz", - "hz_advertised": [ - 3000000000, - 0 - ], - "hz_actual": [ - 3276077000, - 0 - ], - "stepping": 1, - "model": 140, - "family": 6, - "flags": [ - "3dnowprefetch", - "abm", - "acpi", - "adx", - "aes", - "aperfmperf", - "apic", - "arat", - "arch_capabilities", - "arch_perfmon", - "art", - "avx", - "avx2", - "avx512_bitalg", - "avx512_vbmi2", - "avx512_vnni", - "avx512_vp2intersect", - "avx512_vpopcntdq", - "avx512bw", - "avx512cd", - "avx512dq", - "avx512f", - "avx512ifma", - "avx512vbmi", - "avx512vl", - "bmi1", - "bmi2", - "bts", - "cat_l2", - "cdp_l2", - "clflush", - "clflushopt", - "clwb", - "cmov", - "constant_tsc", - "cpuid", - "cpuid_fault", - "cx16", - "cx8", - "de", - "ds_cpl", - "dtes64", - "dtherm", - "dts", - "epb", - "erms", - "est", - "f16c", - "flush_l1d", - "fma", - "fpu", - "fsgsbase", - "fsrm", - "fxsr", - "gfni", - "ht", - "hwp", - "hwp_act_window", - "hwp_epp", - "hwp_notify", - "hwp_pkg_req", - "ibpb", - "ibrs", - 
"ibrs_enhanced", - "ibt", - "ida", - "intel_pt", - "invpcid", - "lahf_lm", - "lm", - "mca", - "mce", - "md_clear", - "mmx", - "monitor", - "movbe", - "movdir64b", - "movdiri", - "msr", - "mtrr", - "nonstop_tsc", - "nopl", - "nx", - "ospke", - "pae", - "pat", - "pbe", - "pcid", - "pclmulqdq", - "pdcm", - "pdpe1gb", - "pebs", - "pge", - "pku", - "pln", - "pni", - "popcnt", - "pse", - "pse36", - "pts", - "rdpid", - "rdrand", - "rdseed", - "rdt_a", - "rdtscp", - "rep_good", - "sdbg", - "sep", - "sha_ni", - "smap", - "smep", - "smx", - "split_lock_detect", - "ss", - "ssbd", - "sse", - "sse2", - "sse4_1", - "sse4_2", - "ssse3", - "stibp", - "syscall", - "tm", - "tm2", - "tme", - "tsc", - "tsc_adjust", - "tsc_deadline_timer", - "tsc_known_freq", - "umip", - "user_shstk", - "vaes", - "vme", - "vpclmulqdq", - "x2apic", - "xgetbv1", - "xsave", - "xsavec", - "xsaveopt", - "xsaves", - "xtopology", - "xtpr" - ], - "l3_cache_size": 12582912, - "l2_cache_size": 5242880, - "l1_data_cache_size": 196608, - "l1_instruction_cache_size": 131072 - } - }, - "commit_info": { - "id": "a644bfc91badea3f9fb136b6597c6d1f2bc8265e", - "time": "2026-02-23T00:30:08+01:00", - "author_time": "2026-02-23T00:30:08+01:00", - "dirty": false, - "project": "openfisca-core", - "branch": "perf/vectorize-members-position" - }, - "benchmarks": [ - { - "group": null, - "name": "test_members_position[N=100]", - "fullname": "benchmarks/test_bench_compute.py::TestMembersPositionBench::test_members_position[N=100]", - "params": { - "nb_persons": 100, - "nb_entities": 40 - }, - "param": "N=100", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 1.0099999296168486e-05, - "max": 1.755666744429618e-05, - "mean": 1.3077266824742158e-05, - "stddev": 2.8922403625739856e-06, - "rounds": 5, - "median": 1.2320000678300858e-05, - "iqr": 3.898921325647583e-06, - "q1": 
1.1036247694088766e-05, - "q3": 1.493516901973635e-05, - "iqr_outliers": 0, - "stddev_outliers": 2, - "outliers": "2;0", - "ld15iqr": 1.0099999296168486e-05, - "hd15iqr": 1.755666744429618e-05, - "ops": 76468.57813652638, - "total": 6.53863341237108e-05, - "iterations": 3 - } - }, - { - "group": null, - "name": "test_members_position[N=10K]", - "fullname": "benchmarks/test_bench_compute.py::TestMembersPositionBench::test_members_position[N=10K]", - "params": { - "nb_persons": 10000, - "nb_entities": 4000 - }, - "param": "N=10K", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 0.0005148546673202267, - "max": 0.0005647429982976367, - "mean": 0.0005284982655818264, - "stddev": 2.0705436398245678e-05, - "rounds": 5, - "median": 0.0005230313302793851, - "iqr": 1.9040831830352545e-05, - "q1": 0.000515377666791513, - "q3": 0.0005344184986218655, - "iqr_outliers": 1, - "stddev_outliers": 1, - "outliers": "1;1", - "ld15iqr": 0.0005148546673202267, - "hd15iqr": 0.0005647429982976367, - "ops": 1892.1537971351618, - "total": 0.002642491327909132, - "iterations": 3 - } - }, - { - "group": null, - "name": "test_members_position[N=100K]", - "fullname": "benchmarks/test_bench_compute.py::TestMembersPositionBench::test_members_position[N=100K]", - "params": { - "nb_persons": 100000, - "nb_entities": 40000 - }, - "param": "N=100K", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 0.007979656336829066, - "max": 0.01013264699819653, - "mean": 0.00884707306686323, - "stddev": 0.0009757951036471044, - "rounds": 5, - "median": 0.00832884966803249, - "iqr": 0.0016657599141277988, - "q1": 0.008103987584036076, - "q3": 0.009769747498163875, - "iqr_outliers": 0, - "stddev_outliers": 1, - "outliers": "1;0", - 
"ld15iqr": 0.007979656336829066, - "hd15iqr": 0.01013264699819653, - "ops": 113.0317329180321, - "total": 0.04423536533431616, - "iterations": 3 - } - }, - { - "group": null, - "name": "test_members_position[N=1M]", - "fullname": "benchmarks/test_bench_compute.py::TestMembersPositionBench::test_members_position[N=1M]", - "params": { - "nb_persons": 1000000, - "nb_entities": 400000 - }, - "param": "N=1M", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 0.10654892533299669, - "max": 0.11162933666491881, - "mean": 0.10784557700001945, - "stddev": 0.0021598373837389945, - "rounds": 5, - "median": 0.10670083000150044, - "iqr": 0.001981643080701659, - "q1": 0.1066622585846441, - "q3": 0.10864390166534577, - "iqr_outliers": 1, - "stddev_outliers": 1, - "outliers": "1;1", - "ld15iqr": 0.10654892533299669, - "hd15iqr": 0.11162933666491881, - "ops": 9.272517499719248, - "total": 0.5392278850000972, - "iterations": 3 - } - }, - { - "group": null, - "name": "test_household_sum[N=10K]", - "fullname": "benchmarks/test_bench_compute.py::TestGroupAggregationBench::test_household_sum[N=10K]", - "params": { - "nb_persons": 10000 - }, - "param": "N=10K", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 2.4007001775316895e-05, - "max": 3.297280054539442e-05, - "mean": 2.6627079932950437e-05, - "stddev": 3.747700367109457e-06, - "rounds": 5, - "median": 2.51803983701393e-05, - "iqr": 4.4557498767971965e-06, - "q1": 2.4010299966903403e-05, - "q3": 2.84660498437006e-05, - "iqr_outliers": 0, - "stddev_outliers": 1, - "outliers": "1;0", - "ld15iqr": 2.4007001775316895e-05, - "hd15iqr": 3.297280054539442e-05, - "ops": 37555.75160769025, - "total": 0.00013313539966475219, - "iterations": 5 - } - }, - { - "group": 
null, - "name": "test_household_sum[N=100K]", - "fullname": "benchmarks/test_bench_compute.py::TestGroupAggregationBench::test_household_sum[N=100K]", - "params": { - "nb_persons": 100000 - }, - "param": "N=100K", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 0.0001577196002472192, - "max": 0.00016500779893249273, - "mean": 0.00015983815945219248, - "stddev": 2.9402275710920996e-06, - "rounds": 5, - "median": 0.00015877099940553308, - "iqr": 2.362348459428198e-06, - "q1": 0.00015829440017114393, - "q3": 0.00016065674863057213, - "iqr_outliers": 1, - "stddev_outliers": 1, - "outliers": "1;1", - "ld15iqr": 0.0001577196002472192, - "hd15iqr": 0.00016500779893249273, - "ops": 6256.328297493313, - "total": 0.0007991907972609624, - "iterations": 5 - } - }, - { - "group": null, - "name": "test_household_any[N=10K]", - "fullname": "benchmarks/test_bench_compute.py::TestGroupAggregationBench::test_household_any[N=10K]", - "params": { - "nb_persons": 10000 - }, - "param": "N=10K", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 2.861660032067448e-05, - "max": 3.0083800083957613e-05, - "mean": 2.939747937489301e-05, - "stddev": 7.171345068488999e-07, - "rounds": 5, - "median": 2.965019957628101e-05, - "iqr": 1.3729993952438242e-06, - "q1": 2.863984918803908e-05, - "q3": 3.0012848583282903e-05, - "iqr_outliers": 0, - "stddev_outliers": 2, - "outliers": "2;0", - "ld15iqr": 2.861660032067448e-05, - "hd15iqr": 3.0083800083957613e-05, - "ops": 34016.52186731535, - "total": 0.00014698739687446505, - "iterations": 5 - } - }, - { - "group": null, - "name": "test_household_any[N=100K]", - "fullname": "benchmarks/test_bench_compute.py::TestGroupAggregationBench::test_household_any[N=100K]", - "params": { - 
"nb_persons": 100000 - }, - "param": "N=100K", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 0.0001826840016292408, - "max": 0.00019266739836893977, - "mean": 0.00018506352033000438, - "stddev": 4.278379758498394e-06, - "rounds": 5, - "median": 0.0001832356007071212, - "iqr": 3.3362986869178604e-06, - "q1": 0.00018277475101058371, - "q3": 0.00018611104969750157, - "iqr_outliers": 1, - "stddev_outliers": 1, - "outliers": "1;1", - "ld15iqr": 0.0001826840016292408, - "hd15iqr": 0.00019266739836893977, - "ops": 5403.550079544606, - "total": 0.000925317601650022, - "iterations": 5 - } - }, - { - "group": null, - "name": "test_disposable_income[N=100]", - "fullname": "benchmarks/test_bench_compute.py::TestFullSimulationBench::test_disposable_income[N=100]", - "params": { - "nb_persons": 100 - }, - "param": "N=100", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 1.0567993740551174e-05, - "max": 2.168200444430113e-05, - "mean": 1.4904998048829535e-05, - "stddev": 5.945209510176881e-06, - "rounds": 3, - "median": 1.2464995961636305e-05, - "iqr": 8.335508027812466e-06, - "q1": 1.1042244295822456e-05, - "q3": 1.9377752323634923e-05, - "iqr_outliers": 0, - "stddev_outliers": 1, - "outliers": "1;0", - "ld15iqr": 1.0567993740551174e-05, - "hd15iqr": 2.168200444430113e-05, - "ops": 67091.5887894751, - "total": 4.471499414648861e-05, - "iterations": 1 - } - }, - { - "group": null, - "name": "test_disposable_income[N=10K]", - "fullname": "benchmarks/test_bench_compute.py::TestFullSimulationBench::test_disposable_income[N=10K]", - "params": { - "nb_persons": 10000 - }, - "param": "N=10K", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - 
"max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 1.6583013348281384e-05, - "max": 4.4988002628088e-05, - "mean": 2.6493338130724926e-05, - "stddev": 1.603056518817539e-05, - "rounds": 3, - "median": 1.79089984158054e-05, - "iqr": 2.130374195985496e-05, - "q1": 1.6914509615162387e-05, - "q3": 3.821825157501735e-05, - "iqr_outliers": 0, - "stddev_outliers": 1, - "outliers": "1;0", - "ld15iqr": 1.6583013348281384e-05, - "hd15iqr": 4.4988002628088e-05, - "ops": 37745.337905919725, - "total": 7.948001439217478e-05, - "iterations": 1 - } - }, - { - "group": null, - "name": "test_disposable_income[N=100K]", - "fullname": "benchmarks/test_bench_compute.py::TestFullSimulationBench::test_disposable_income[N=100K]", - "params": { - "nb_persons": 100000 - }, - "param": "N=100K", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 1.4842007658444345e-05, - "max": 7.90880003478378e-05, - "mean": 3.776266627634565e-05, - "stddev": 3.585994915048551e-05, - "rounds": 3, - "median": 1.93579908227548e-05, - "iqr": 4.8184494517045096e-05, - "q1": 1.597100344952196e-05, - "q3": 6.415549796656705e-05, - "iqr_outliers": 0, - "stddev_outliers": 1, - "outliers": "1;0", - "ld15iqr": 1.4842007658444345e-05, - "hd15iqr": 7.90880003478378e-05, - "ops": 26481.18098129091, - "total": 0.00011328799882903695, - "iterations": 1 - } - }, - { - "group": null, - "name": "test_income_tax[N=100]", - "fullname": "benchmarks/test_bench_compute.py::TestFullSimulationBench::test_income_tax[N=100]", - "params": { - "nb_persons": 100 - }, - "param": "N=100", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 1.4769670087844133e-05, - "max": 1.666599807019035e-05, - "mean": 1.5689467545598745e-05, - 
"stddev": 7.112743567923531e-07, - "rounds": 5, - "median": 1.584399918404718e-05, - "iqr": 9.213351101304105e-07, - "q1": 1.5156667359406129e-05, - "q3": 1.607800246953654e-05, - "iqr_outliers": 0, - "stddev_outliers": 2, - "outliers": "2;0", - "ld15iqr": 1.4769670087844133e-05, - "hd15iqr": 1.666599807019035e-05, - "ops": 63737.02594391247, - "total": 7.844733772799373e-05, - "iterations": 3 - } - }, - { - "group": null, - "name": "test_income_tax[N=10K]", - "fullname": "benchmarks/test_bench_compute.py::TestFullSimulationBench::test_income_tax[N=10K]", - "params": { - "nb_persons": 10000 - }, - "param": "N=10K", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 8.9913325306649e-06, - "max": 1.019566843751818e-05, - "mean": 9.368267880442242e-06, - "stddev": 4.85979667036985e-07, - "rounds": 5, - "median": 9.170000945838789e-06, - "iqr": 5.308368902963902e-07, - "q1": 9.064583233945692e-06, - "q3": 9.595420124242082e-06, - "iqr_outliers": 0, - "stddev_outliers": 1, - "outliers": "1;0", - "ld15iqr": 8.9913325306649e-06, - "hd15iqr": 1.019566843751818e-05, - "ops": 106743.31826992908, - "total": 4.6841339402211205e-05, - "iterations": 3 - } - }, - { - "group": null, - "name": "test_tbs_loading", - "fullname": "benchmarks/test_bench_compute.py::TestTBSLoadingBench::test_tbs_loading", - "params": null, - "param": null, - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 0.005803033986012451, - "max": 0.006510506005724892, - "mean": 0.0060930316637192545, - "stddev": 0.00037056293611823675, - "rounds": 3, - "median": 0.005965554999420419, - "iqr": 0.0005306040147843305, - "q1": 0.005843664239364443, - "q3": 0.006374268254148774, - "iqr_outliers": 0, - "stddev_outliers": 1, - "outliers": "1;0", - 
"ld15iqr": 0.005803033986012451, - "hd15iqr": 0.006510506005724892, - "ops": 164.1219109289166, - "total": 0.018279094991157763, - "iterations": 1 - } - } - ], - "datetime": "2026-02-23T11:59:20.556345+00:00", - "version": "5.2.3" -} \ No newline at end of file diff --git a/.benchmarks/Linux-CPython-3.11-64bit/0002_master_loop.json b/.benchmarks/Linux-CPython-3.11-64bit/0002_master_loop.json deleted file mode 100644 index 168661323..000000000 --- a/.benchmarks/Linux-CPython-3.11-64bit/0002_master_loop.json +++ /dev/null @@ -1,725 +0,0 @@ -{ - "machine_info": { - "node": "msika", - "processor": "x86_64", - "machine": "x86_64", - "python_compiler": "Clang 18.1.8 ", - "python_implementation": "CPython", - "python_implementation_version": "3.11.11", - "python_version": "3.11.11", - "python_build": [ - "main", - "Dec 6 2024 20:02:44" - ], - "release": "6.8.0-100-generic", - "system": "Linux", - "cpu": { - "python_version": "3.11.11.final.0 (64 bit)", - "cpuinfo_version": [ - 9, - 0, - 0 - ], - "cpuinfo_version_string": "9.0.0", - "arch": "X86_64", - "bits": 64, - "count": 8, - "arch_string_raw": "x86_64", - "vendor_id_raw": "GenuineIntel", - "brand_raw": "11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz", - "hz_advertised_friendly": "3.0000 GHz", - "hz_actual_friendly": "1.3731 GHz", - "hz_advertised": [ - 3000000000, - 0 - ], - "hz_actual": [ - 1373112000, - 0 - ], - "stepping": 1, - "model": 140, - "family": 6, - "flags": [ - "3dnowprefetch", - "abm", - "acpi", - "adx", - "aes", - "aperfmperf", - "apic", - "arat", - "arch_capabilities", - "arch_perfmon", - "art", - "avx", - "avx2", - "avx512_bitalg", - "avx512_vbmi2", - "avx512_vnni", - "avx512_vp2intersect", - "avx512_vpopcntdq", - "avx512bw", - "avx512cd", - "avx512dq", - "avx512f", - "avx512ifma", - "avx512vbmi", - "avx512vl", - "bmi1", - "bmi2", - "bts", - "cat_l2", - "cdp_l2", - "clflush", - "clflushopt", - "clwb", - "cmov", - "constant_tsc", - "cpuid", - "cpuid_fault", - "cx16", - "cx8", - "de", - "ds_cpl", - 
"dtes64", - "dtherm", - "dts", - "epb", - "erms", - "est", - "f16c", - "flush_l1d", - "fma", - "fpu", - "fsgsbase", - "fsrm", - "fxsr", - "gfni", - "ht", - "hwp", - "hwp_act_window", - "hwp_epp", - "hwp_notify", - "hwp_pkg_req", - "ibpb", - "ibrs", - "ibrs_enhanced", - "ibt", - "ida", - "intel_pt", - "invpcid", - "lahf_lm", - "lm", - "mca", - "mce", - "md_clear", - "mmx", - "monitor", - "movbe", - "movdir64b", - "movdiri", - "msr", - "mtrr", - "nonstop_tsc", - "nopl", - "nx", - "ospke", - "pae", - "pat", - "pbe", - "pcid", - "pclmulqdq", - "pdcm", - "pdpe1gb", - "pebs", - "pge", - "pku", - "pln", - "pni", - "popcnt", - "pse", - "pse36", - "pts", - "rdpid", - "rdrand", - "rdseed", - "rdt_a", - "rdtscp", - "rep_good", - "sdbg", - "sep", - "sha_ni", - "smap", - "smep", - "smx", - "split_lock_detect", - "ss", - "ssbd", - "sse", - "sse2", - "sse4_1", - "sse4_2", - "ssse3", - "stibp", - "syscall", - "tm", - "tm2", - "tme", - "tsc", - "tsc_adjust", - "tsc_deadline_timer", - "tsc_known_freq", - "umip", - "user_shstk", - "vaes", - "vme", - "vpclmulqdq", - "x2apic", - "xgetbv1", - "xsave", - "xsavec", - "xsaveopt", - "xsaves", - "xtopology", - "xtpr" - ], - "l3_cache_size": 12582912, - "l2_cache_size": 5242880, - "l1_data_cache_size": 196608, - "l1_instruction_cache_size": 131072 - } - }, - "commit_info": { - "id": "44e17408e0b1a242e63b0819574fd23fcf5a01d3", - "time": "2026-02-19T16:45:21+01:00", - "author_time": "2026-02-19T16:45:21+01:00", - "dirty": false, - "project": "openfisca-core", - "branch": "master" - }, - "benchmarks": [ - { - "group": null, - "name": "test_members_position[N=100]", - "fullname": "benchmarks/test_bench_compute.py::TestMembersPositionBench::test_members_position[N=100]", - "params": { - "nb_persons": 100, - "nb_entities": 40 - }, - "param": "N=100", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 
4.0168665388288595e-05, - "max": 4.5296333458585046e-05, - "mean": 4.194106732029468e-05, - "stddev": 1.9675315493748894e-06, - "rounds": 5, - "median": 4.125066819445541e-05, - "iqr": 1.7686713060053731e-06, - "q1": 4.091991528791065e-05, - "q3": 4.268858659391602e-05, - "iqr_outliers": 0, - "stddev_outliers": 1, - "outliers": "1;0", - "ld15iqr": 4.0168665388288595e-05, - "hd15iqr": 4.5296333458585046e-05, - "ops": 23842.979301485597, - "total": 0.0002097053366014734, - "iterations": 3 - } - }, - { - "group": null, - "name": "test_members_position[N=10K]", - "fullname": "benchmarks/test_bench_compute.py::TestMembersPositionBench::test_members_position[N=10K]", - "params": { - "nb_persons": 10000, - "nb_entities": 4000 - }, - "param": "N=10K", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 0.004260326333072347, - "max": 0.004545084331766702, - "mean": 0.004424344800645486, - "stddev": 0.00012371705582191393, - "rounds": 5, - "median": 0.00443869833543431, - "iqr": 0.00021784924926275106, - "q1": 0.004320857334581281, - "q3": 0.004538706583844032, - "iqr_outliers": 0, - "stddev_outliers": 1, - "outliers": "1;0", - "ld15iqr": 0.004260326333072347, - "hd15iqr": 0.004545084331766702, - "ops": 226.02216713627428, - "total": 0.022121724003227428, - "iterations": 3 - } - }, - { - "group": null, - "name": "test_members_position[N=100K]", - "fullname": "benchmarks/test_bench_compute.py::TestMembersPositionBench::test_members_position[N=100K]", - "params": { - "nb_persons": 100000, - "nb_entities": 40000 - }, - "param": "N=100K", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 0.04322176233108621, - "max": 0.04443494733034944, - "mean": 0.04370555639907252, - "stddev": 0.0004796096321093509, - 
"rounds": 5, - "median": 0.04364692333426016, - "iqr": 0.0006902820035369003, - "q1": 0.043321003580786055, - "q3": 0.044011285584322955, - "iqr_outliers": 0, - "stddev_outliers": 2, - "outliers": "2;0", - "ld15iqr": 0.04322176233108621, - "hd15iqr": 0.04443494733034944, - "ops": 22.880385982712742, - "total": 0.2185277819953626, - "iterations": 3 - } - }, - { - "group": null, - "name": "test_members_position[N=1M]", - "fullname": "benchmarks/test_bench_compute.py::TestMembersPositionBench::test_members_position[N=1M]", - "params": { - "nb_persons": 1000000, - "nb_entities": 400000 - }, - "param": "N=1M", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 0.4722738350004268, - "max": 0.5827776293299394, - "mean": 0.502778451066115, - "stddev": 0.047362587743401435, - "rounds": 5, - "median": 0.4768990046674541, - "iqr": 0.055565359331618935, - "q1": 0.4723269232502692, - "q3": 0.5278922825818881, - "iqr_outliers": 0, - "stddev_outliers": 1, - "outliers": "1;0", - "ld15iqr": 0.4722738350004268, - "hd15iqr": 0.5827776293299394, - "ops": 1.988947612769706, - "total": 2.513892255330575, - "iterations": 3 - } - }, - { - "group": null, - "name": "test_household_sum[N=10K]", - "fullname": "benchmarks/test_bench_compute.py::TestGroupAggregationBench::test_household_sum[N=10K]", - "params": { - "nb_persons": 10000 - }, - "param": "N=10K", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 2.3738198797218503e-05, - "max": 3.262080135755241e-05, - "mean": 2.6451560552231966e-05, - "stddev": 3.5561948497983546e-06, - "rounds": 5, - "median": 2.509420155547559e-05, - "iqr": 3.3431009796913767e-06, - "q1": 2.4425049923593177e-05, - "q3": 2.7768150903284554e-05, - "iqr_outliers": 0, - "stddev_outliers": 1, - 
"outliers": "1;0", - "ld15iqr": 2.3738198797218503e-05, - "hd15iqr": 3.262080135755241e-05, - "ops": 37804.95286942987, - "total": 0.00013225780276115984, - "iterations": 5 - } - }, - { - "group": null, - "name": "test_household_sum[N=100K]", - "fullname": "benchmarks/test_bench_compute.py::TestGroupAggregationBench::test_household_sum[N=100K]", - "params": { - "nb_persons": 100000 - }, - "param": "N=100K", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 0.00016112519952002913, - "max": 0.0001934773987159133, - "mean": 0.0001690311194397509, - "stddev": 1.3731218458406051e-05, - "rounds": 5, - "median": 0.00016332859813701363, - "iqr": 9.867047629086322e-06, - "q1": 0.00016210095127462408, - "q3": 0.0001719679989037104, - "iqr_outliers": 1, - "stddev_outliers": 1, - "outliers": "1;1", - "ld15iqr": 0.00016112519952002913, - "hd15iqr": 0.0001934773987159133, - "ops": 5916.070385822876, - "total": 0.0008451555971987546, - "iterations": 5 - } - }, - { - "group": null, - "name": "test_household_any[N=10K]", - "fullname": "benchmarks/test_bench_compute.py::TestGroupAggregationBench::test_household_any[N=10K]", - "params": { - "nb_persons": 10000 - }, - "param": "N=10K", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 2.868400188162923e-05, - "max": 3.1998599297367036e-05, - "mean": 3.0395799549296497e-05, - "stddev": 1.3838419379124302e-06, - "rounds": 5, - "median": 3.0171198886819184e-05, - "iqr": 2.3490480089094436e-06, - "q1": 2.933275027316995e-05, - "q3": 3.168179828207939e-05, - "iqr_outliers": 0, - "stddev_outliers": 2, - "outliers": "2;0", - "ld15iqr": 2.868400188162923e-05, - "hd15iqr": 3.1998599297367036e-05, - "ops": 32899.28262548846, - "total": 0.00015197899774648248, - 
"iterations": 5 - } - }, - { - "group": null, - "name": "test_household_any[N=100K]", - "fullname": "benchmarks/test_bench_compute.py::TestGroupAggregationBench::test_household_any[N=100K]", - "params": { - "nb_persons": 100000 - }, - "param": "N=100K", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 0.00019117179908789694, - "max": 0.00026818519982043656, - "mean": 0.00020906075951643288, - "stddev": 3.327185668407188e-05, - "rounds": 5, - "median": 0.00019381099846214057, - "iqr": 2.6140901900362272e-05, - "q1": 0.00019140009899274444, - "q3": 0.0002175410008931067, - "iqr_outliers": 1, - "stddev_outliers": 1, - "outliers": "1;1", - "ld15iqr": 0.00019117179908789694, - "hd15iqr": 0.00026818519982043656, - "ops": 4783.298416752363, - "total": 0.0010453037975821646, - "iterations": 5 - } - }, - { - "group": null, - "name": "test_disposable_income[N=100]", - "fullname": "benchmarks/test_bench_compute.py::TestFullSimulationBench::test_disposable_income[N=100]", - "params": { - "nb_persons": 100 - }, - "param": "N=100", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 1.0382995242252946e-05, - "max": 2.2827007342129946e-05, - "mean": 1.500966512442877e-05, - "stddev": 6.80798441170266e-06, - "rounds": 3, - "median": 1.1818992788903415e-05, - "iqr": 9.33300907490775e-06, - "q1": 1.0741994628915563e-05, - "q3": 2.0075003703823313e-05, - "iqr_outliers": 0, - "stddev_outliers": 1, - "outliers": "1;0", - "ld15iqr": 1.0382995242252946e-05, - "hd15iqr": 2.2827007342129946e-05, - "ops": 66623.73821868045, - "total": 4.502899537328631e-05, - "iterations": 1 - } - }, - { - "group": null, - "name": "test_disposable_income[N=10K]", - "fullname": 
"benchmarks/test_bench_compute.py::TestFullSimulationBench::test_disposable_income[N=10K]", - "params": { - "nb_persons": 10000 - }, - "param": "N=10K", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 1.4613004168495536e-05, - "max": 5.059099930804223e-05, - "mean": 2.7450664977853496e-05, - "stddev": 2.00801604780174e-05, - "rounds": 3, - "median": 1.7147991457022727e-05, - "iqr": 2.698349635466002e-05, - "q1": 1.5246750990627334e-05, - "q3": 4.223024734528735e-05, - "iqr_outliers": 0, - "stddev_outliers": 1, - "outliers": "1;0", - "ld15iqr": 1.4613004168495536e-05, - "hd15iqr": 5.059099930804223e-05, - "ops": 36428.99000103548, - "total": 8.235199493356049e-05, - "iterations": 1 - } - }, - { - "group": null, - "name": "test_disposable_income[N=100K]", - "fullname": "benchmarks/test_bench_compute.py::TestFullSimulationBench::test_disposable_income[N=100K]", - "params": { - "nb_persons": 100000 - }, - "param": "N=100K", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 1.988699659705162e-05, - "max": 8.648600487504154e-05, - "mean": 4.4254668561431267e-05, - "stddev": 3.67177049104301e-05, - "rounds": 3, - "median": 2.639100421220064e-05, - "iqr": 4.994925620849244e-05, - "q1": 2.1512998500838876e-05, - "q3": 7.146225470933132e-05, - "iqr_outliers": 0, - "stddev_outliers": 1, - "outliers": "1;0", - "ld15iqr": 1.988699659705162e-05, - "hd15iqr": 8.648600487504154e-05, - "ops": 22596.48603201873, - "total": 0.0001327640056842938, - "iterations": 1 - } - }, - { - "group": null, - "name": "test_income_tax[N=100]", - "fullname": "benchmarks/test_bench_compute.py::TestFullSimulationBench::test_income_tax[N=100]", - "params": { - "nb_persons": 100 - }, - "param": "N=100", - "extra_info": {}, - 
"options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 1.3522667965541283e-05, - "max": 1.6120999741057556e-05, - "mean": 1.481039992844065e-05, - "stddev": 1.0281796533679763e-06, - "rounds": 5, - "median": 1.4928999007679522e-05, - "iqr": 1.6150843293871748e-06, - "q1": 1.3952666146603102e-05, - "q3": 1.5567750475990277e-05, - "iqr_outliers": 0, - "stddev_outliers": 2, - "outliers": "2;0", - "ld15iqr": 1.3522667965541283e-05, - "hd15iqr": 1.6120999741057556e-05, - "ops": 67520.12132229352, - "total": 7.405199964220326e-05, - "iterations": 3 - } - }, - { - "group": null, - "name": "test_income_tax[N=10K]", - "fullname": "benchmarks/test_bench_compute.py::TestFullSimulationBench::test_income_tax[N=10K]", - "params": { - "nb_persons": 10000 - }, - "param": "N=10K", - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 1.3846001820638776e-05, - "max": 1.6765664137589436e-05, - "mean": 1.5520599360267323e-05, - "stddev": 1.1708157659204103e-06, - "rounds": 5, - "median": 1.601499873989572e-05, - "iqr": 1.7261687996021173e-06, - "q1": 1.4579498383682221e-05, - "q3": 1.630566718328434e-05, - "iqr_outliers": 0, - "stddev_outliers": 2, - "outliers": "2;0", - "ld15iqr": 1.3846001820638776e-05, - "hd15iqr": 1.6765664137589436e-05, - "ops": 64430.50147663733, - "total": 7.76029968013366e-05, - "iterations": 3 - } - }, - { - "group": null, - "name": "test_tbs_loading", - "fullname": "benchmarks/test_bench_compute.py::TestTBSLoadingBench::test_tbs_loading", - "params": null, - "param": null, - "extra_info": {}, - "options": { - "disable_gc": false, - "timer": "perf_counter", - "min_rounds": 5, - "max_time": 1.0, - "min_time": 5e-06, - "warmup": false - }, - "stats": { - "min": 0.009633206005673856, - "max": 0.010449794004671276, - 
"mean": 0.009932148668061322, - "stddev": 0.00045008569718521806, - "rounds": 3, - "median": 0.009713445993838832, - "iqr": 0.0006124409992480651, - "q1": 0.0096532660027151, - "q3": 0.010265707001963165, - "iqr_outliers": 0, - "stddev_outliers": 1, - "outliers": "1;0", - "ld15iqr": 0.009633206005673856, - "hd15iqr": 0.010449794004671276, - "ops": 100.68314857344885, - "total": 0.029796446004183963, - "iterations": 1 - } - } - ], - "datetime": "2026-02-23T12:00:01.542713+00:00", - "version": "5.2.3" -} \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e9fff13c..d9016e177 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,88 +1,12 @@ # Changelog -## 44.5.0 [#1368](https://github.com/openfisca/openfisca-core/pull/1368) - -#### New features - -- Add `transition_formula` to `Variable` for formula-driven `as_of` forward simulation. - - A variable with `transition_formula` computes sparse updates instead of full arrays: the formula returns `(selector, values)` where `selector` is a boolean mask or index array, and `values` is the new values for the selected individuals. - - Each call to `get_array` at a new period triggers the transition formula once (guarded by `_as_of_transition_computed`), applies the sparse diff via `set_input_sparse`, and caches the result. - - `set_input_sparse` is also exposed as a public method on `Holder` for callers that want to apply sparse patches directly. - -- Add `initial_formula` to `Variable` for seeding `as_of` variables without a prior `set_input`. - - When a `transition_formula` needs to read the variable at `period - 1` but no base snapshot exists, OpenFisca now calls `initial_formula` instead of raising an error. - - `initial_formula` follows the same date-dispatch convention as regular formulas (`initial_formula_YYYY`, `initial_formula_YYYY_MM`, etc.). - - Requires `as_of = True` on the same variable; a `ValueError` is raised at definition time otherwise. 
- -- Add multi-snapshot LRU cache to `as_of` variable holders. - - Replaces the previous single-entry snapshot cursor with an `OrderedDict`-based LRU cache keeping the K most-recently-used reconstructed snapshots. - - Cache size defaults to 3 and is configurable per variable (`Variable.snapshot_count`) or globally (`MemoryConfig.asof_max_snapshots`), with variable-level taking priority. - - Retroactive `set_input` (out-of-order writes) evicts all cached snapshots at or after the written instant to preserve correctness. - -- Add `formula_type` field to `TraceNode` for `as_of` formula visibility. - - When `transition_formula` or `initial_formula` runs, the tracer records `formula_type = "transition"` or `formula_type = "initial"` on the corresponding trace node. - -- Add `show_formula_type` option to `computation_log`. - - `simulation.tracer.computation_log.print_log(show_formula_type=True)` appends `[transition]` or `[initial]` tags to the relevant lines, making it easy to see which `as_of` formula ran during a simulation. - -#### Bug fixes - -- Fix false `SpiralError` when a `transition_formula` reads its own variable at the previous period. - - The existing spiral detector raised `SpiralError` immediately when the same variable appeared in the call stack at any different period, which always triggers for temporal recursion (`V@P` → `V@P-1` → `V@P-2`). - - Fix: in `_calculate_transition`, the cycle check is replaced by `_check_for_strict_cycle`, which only raises `CycleError` for the exact same `(variable, period)` pair. Termination is guaranteed by `_as_of_transition_computed`. - -## 44.4.1 - -#### Performance improvements - -- Fix quadratic reconstruction cost in `as_of` forward simulations. - - In the typical GET(M-1) → compute → SET(M) monthly loop, `_set_as_of` was unconditionally clearing the snapshot cursor after each write, forcing the next `get_array(M)` to reconstruct from the base through all M patches — O(N + M·k) per step, quadratic overall. 
- - Root cause: `_reconstruct_at` advanced the snapshot to `instant` during the internal diff computation, so the invalidation guard `snapshot[0] >= instant` triggered on equality even for strictly forward writes. - - Fix: when the new patch is appended at the end of the list (forward-sequential SET), the snapshot is updated to the new state instead of being discarded. Retroactive (out-of-order) writes still invalidate the snapshot correctly. - - Benchmark (N=1M, forward simulation): 1 yr / 10% change ×1.4, 5 yr / 10% ×4.1, 5 yr / 30% ×5.4. - -## 44.4.0 [#1366](https://github.com/openfisca/openfisca-core/pull/1366) - -#### Performance improvements - -- Replace dense array storage with sparse patch storage for `as_of` variables. - - Instead of storing one full array per `set_input` call, the holder now keeps a single base array and a list of `(instant, changed_indices, changed_values)` patches. - - Memory reduction: ~60× for a 0.5% monthly change rate over 120 months (e.g. ~4 MB vs ~240 MB for 1M individuals). - - GET performance: a snapshot cursor makes forward-sequential reads O(k) (only new patches applied) instead of O(N); backward jumps degrade gracefully to O(N + k×P). - - Retroactive `set_input` (out-of-order patches) is supported with automatic snapshot invalidation. - - No change to the public API (`set_input`, `get_array`, `Variable.as_of`). -- Fix quadratic reconstruction cost in `as_of` forward simulations: when the new patch is appended at the end (forward-sequential SET), the snapshot is updated instead of discarded so the next GET does not reconstruct from base through all patches; retroactive writes still invalidate correctly. - -## 44.4.0 [#1364](https://github.com/openfisca/openfisca-core/pull/1364) - -#### New features - -- **Entity links**: role-based and positional accessors, and dynamic population period-index helpers. 
- - `Many2OneLink.get_by_role(variable_name, period, role_value=...)`, `One2ManyLink.get_by_role(...)` and `ImplicitOne2ManyLink.get_by_role(...)`. - - `Many2OneLink.rank(variable_name, period)` (and on chained getter, e.g. `person.links["mother"].household.rank("age", period)`). - - `One2ManyLink.nth(n, variable_name, period, role=..., condition=...)` for the n-th target member per source. - - `has_role(role_value)` now supports `Role` objects (comparison by `.key`) in addition to raw values. - - `CorePopulation.snapshot_period(period)` and `get_period_id_to_rownum(period)` for optional dynamic-population period indexing. - -#### Technical changes - -- Removed unused `openfisca_core.model_api` import in `tests/core/parameters_date_indexing/test_date_indexing.py`. -- SimulationBuilder sets `_id_to_rownum` identity mapping for static simulations (`build_default_simulation`, `build_from_dict` / `build_from_entities`), for dynamic-population support. -- Add `PYTHON` variable to `tasks/lint.mk` so `make lint PYTHON=.venv/bin/python` works; fix style in `test_link_accessors.py` and remove unused variable in `test_many2one.py`. - -## 44.3.0 [#1365](https://github.com/openfisca/openfisca-core/pull/1365) - -#### New features - -- **Generic Entity Links (Phase 1-6)**: Introduced a new Liam2-inspired generic entity linking system avoiding rigid hierarchies like `Person -> Household`. - - Added new `Many2OneLink` and `One2ManyLink` models to create powerful inter-entity networks (e.g., `Person -> Employer`). - - Added implicit links directly binding members arrays. This powers the new `population.links` property natively inside `TaxBenefitSystem.instantiate_entities()`. - - Full capability to chain relationships via python: `person.mother.household.get("rent", period)`. - - Powerful vectorized declarative aggregations out-of-the-box (e.g., `households.persons.sum("salary", period, condition=is_female)`). 
+## 44.6.0 #### Technical changes -- Backward compatibility is 100% maintained. Existing syntax via Projectors natively redirects to implicit links via modified `__getattr__`. +- Revert the codebase to commit `8d26af3` to align the repository with the currently published `44.2.2` baseline on PyPI. +- Mark versions `44.3.0`, `44.4.0`, `44.4.1`, and `44.5.0` as yanked. +- Bump version to `44.6.0` to restore continuous deployment from this reverted baseline. ## 44.2.2 diff --git a/PR_DESCRIPTION.md b/PR_DESCRIPTION.md deleted file mode 100644 index d3447660c..000000000 --- a/PR_DESCRIPTION.md +++ /dev/null @@ -1,57 +0,0 @@ -# Feature: Generic Entity Links (LIAM2-inspired) - -## Context & Motivation - -OpenFisca's traditional entity model has historically been strictly hierarchical and bipartite: individuals belong to groups (households, families, tax units), and groups contain individuals. This rigid structure works well for static tax-benefit systems but struggles with complex, real-world socioeconomic models, such as: -- **Intra-entity relationships**: Kinship graphs (person $\rightarrow$ mother, person $\rightarrow$ spouse). -- **Arbitrary inter-entity networks**: Employment networks (person $\rightarrow$ employer), geographical mobility, or ad-hoc associations. -- **Deep chaining**: Navigating multiple relationship hops (e.g., "the region of the household of the mother of the person"). - -To solve this, we drew inspiration from [LIAM2's linking system](https://liam2.plan.be/) and adapted it to OpenFisca's unique architecture (specifically integrating with our `Role` semantics and vectorized execution). - -## What we did - -This PR introduces a generic, highly performant, and **100% backward-compatible** Entity Linking system. - -### 1. Core Link Classes (`openfisca_core/links`) -- **`Many2OneLink`**: Resolves *N* source members to *1* target entity (e.g., `person.mother`, `person.employer`). 
Supports fetching values (`.get()`) and dynamic chaining (`.mother.household.rent`). -- **`One2ManyLink`**: Aggregates from *N* target members back to *1* source entity. Supports a wide suite of vectorized aggregations (`sum`, `count`, `any`, `all`, `min`, `max`, `avg`) along with filtering by `role` or an arbitrary boolean `condition` mask. - -### 2. Implicit Links & Backward Compatibility -A major design goal was to avoid breaking existing country packages (`openfisca-france`, `openfisca-tunisia`, etc.). -- Links are strictly **additive**. -- During `Simulation` initialization, OpenFisca now automatically reads the existing `GroupEntity` structure and injects **Implicit Links**: - - `ImplicitMany2OneLink`: Automatically adds `person.household`, mapping directly to the high-performance `GroupPopulation.members_entity_id` array. - - `ImplicitOne2ManyLink`: Automatically adds `household.persons`, replacing the need for verbose legacy aggregations. -- `Population.__getattr__` was carefully patched to first check `self.links["..."]` before natively falling back to the legacy `get_projector_from_shortcut()` route. *Everything keeps working identically.* - -### 3. Syntax Sugar & Chaining -The new API allows natural, pythonic data fetching: -```python -# Old projector way (still works!): -rents = sim.persons.household("rent", "2024") - -# New explicit link definition (e.g., for arbitrary networks) -mother_link = Many2OneLink(name="mother", link_field="mother_id", target_entity_key="person") -person_entity.add_link(mother_link) - -# New chaining syntax: -mother_household_rents = sim.persons.mother.household.get("rent", "2024") - -# New declarative aggregations: -female_salaries = sim.households.persons.sum("salary", "2024", condition=is_female) -``` - -## Performance -Performance is a critical constraint for OpenFisca simulations. We added `pytest-benchmark` tests validating the new mechanics. 
-- `.get()` resolutions (Many-to-One) perform identically to legacy Projectors (~118μs on 15,000 entities). -- Aggregations (`One2Many.sum()`) introduce a negligible setup overhead (< 1ms) but execute fully vectorized `numpy.bincount` and `numpy.maximum.at` operations under the hood. - -## Associated Documentation -We've added guides to help framework users model new relationships: -- `docs/implementation/links-api.md`: Reference for creating and querying `Many2OneLink` and `One2ManyLink`. -- `docs/implementation/transition-guide.md`: Migration guide demonstrating how to gradually adopt Links over Legacy Projectors. - -## Testing -- 12 new, comprehensive tests covering unit mechanics, system integrations, filtering, chaining, and OpenFisca core lifecycle (`_resolve_links`). -- All 158 core tests and existing Country Template tests continue to pass locally (`make test-code`). diff --git a/benchmarks/README.md b/benchmarks/README.md deleted file mode 100644 index f562b70e6..000000000 --- a/benchmarks/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# Benchmarks - -## How to run - -```bash -# Run all benchmarks -make benchmark - -# Run compute benchmarks only -.venv/bin/python -m pytest benchmarks/test_bench_compute.py -v --benchmark-sort=name - -# Run memory benchmarks only -.venv/bin/python -m pytest benchmarks/test_bench_memory.py -v -s - -# Save results for later comparison -.venv/bin/python -m pytest benchmarks/ --benchmark-save=my_baseline - -# Compare with a saved baseline -.venv/bin/python -m pytest benchmarks/ --benchmark-compare=0001_my_baseline -``` - -## Benchmarks included - -### Compute (`test_bench_compute.py`) - -| Benchmark | What it measures | Sizes | -|---|---|---| -| `members_position` | GroupPopulation position assignment | 100 → 1M | -| `group_sum` | `household.sum(salary)` | 100 → 1M | -| `disposable_income` | Full variable cascade (~15 vars) | 100 → 100K | -| `tbs_loading` | TaxBenefitSystem initialization | 1 | - -### Memory (`test_bench_memory.py`) 
- -| Benchmark | What it measures | Sizes | -|---|---|---| -| `members_position_memory` | Peak memory for position calc | 10K → 1M | -| `simulation_memory` | Peak memory for full simulation | 10K → 1M | -| `per_variable_memory` | Memory per variable per person | 10K → 100K | diff --git a/benchmarks/conftest.py b/benchmarks/conftest.py deleted file mode 100644 index 83426ccf5..000000000 --- a/benchmarks/conftest.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Shared fixtures for OpenFisca benchmarks.""" - -import numpy -import pytest - - -@pytest.fixture(params=[100, 10_000, 100_000, 1_000_000], ids=lambda n: f"N={n:_}") -def population_size(request): - """Population sizes to benchmark.""" - return request.param - - -@pytest.fixture(params=[100, 10_000, 100_000], ids=lambda n: f"N={n:_}") -def simulation_size(request): - """Population sizes for full simulation benchmarks (capped for speed).""" - return request.param - - -@pytest.fixture -def rng(): - """Deterministic random number generator.""" - return numpy.random.default_rng(42) - - -@pytest.fixture -def make_group_population(): - """Factory to create a GroupPopulation with random entity assignment.""" - - def _make(nb_persons, nb_entities=None): - from openfisca_core.populations.group_population import GroupPopulation - - if nb_entities is None: - nb_entities = max(1, nb_persons // 3) - - rng = numpy.random.default_rng(42) - pop = GroupPopulation.__new__(GroupPopulation) - pop._members_entity_id = rng.integers(0, nb_entities, size=nb_persons) - pop._members_position = None - pop._ordered_members_map = None - return pop - - return _make - - -@pytest.fixture -def make_simulation(): - """Factory to create a Simulation with salary input.""" - - def _make(nb_persons): - from openfisca_country_template import CountryTaxBenefitSystem - - from openfisca_core.simulations import SimulationBuilder - - tbs = CountryTaxBenefitSystem() - sim = SimulationBuilder().build_default_simulation(tbs, count=nb_persons) - - rng = 
numpy.random.default_rng(42) - sim.set_input("salary", "2024-01", rng.uniform(1000, 5000, nb_persons)) - return sim - - return _make diff --git a/benchmarks/test_bench_asof.py b/benchmarks/test_bench_asof.py deleted file mode 100644 index 690f9a120..000000000 --- a/benchmarks/test_bench_asof.py +++ /dev/null @@ -1,313 +0,0 @@ -"""Benchmarks for as_of variable sparse patch storage. - -Run with: - .venv/bin/pytest benchmarks/test_bench_asof.py -v -s -k memory - .venv/bin/pytest benchmarks/test_bench_asof.py -v --benchmark-sort=name -k compute -""" - -from __future__ import annotations - -import numpy -import pytest - -from openfisca_core.entities import Entity -from openfisca_core.holders import Holder -from openfisca_core.periods import DateUnit, period -from openfisca_core.populations import Population -from openfisca_core.variables import Variable - -# --------------------------------------------------------------------------- -# Local helpers -# --------------------------------------------------------------------------- - -_entity = Entity("person", "persons", "", "") - - -class _AsOfVar(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - - -def _make_holder(count: int) -> Holder: - pop = Population(_entity) - pop.simulation = None - pop.count = count - return Holder(_AsOfVar(), pop) - - -def _populate(holder: Holder, n_patches: int, change_rate: float, rng) -> None: - """Set a base array then *n_patches* incremental updates. - - Each patch randomly changes *change_rate* fraction of individuals. 
- """ - n = holder.population.count - base = rng.integers(0, 10, size=n).astype(numpy.int32) - holder.set_input("2020-01", base) - - current = base.copy() - for p in range(1, n_patches + 1): - month = f"2020-{p + 1:02d}" if p < 12 else f"{2020 + p // 12}-{p % 12 + 1:02d}" - k = max(1, int(n * change_rate)) - idx = rng.choice(n, size=k, replace=False) - current = current.copy() - current[idx] = rng.integers(0, 10, size=k).astype(numpy.int32) - holder.set_input(month, current) - - -def _patch_memory(holder: Holder) -> int: - """Return bytes used by sparse storage (base + all patches).""" - base_bytes = holder._as_of_base.nbytes - patch_bytes = sum( - idx.nbytes + vals.nbytes for _, idx, vals in holder._as_of_patches - ) - return base_bytes + patch_bytes - - -def _dense_memory(n: int, dtype, n_periods: int) -> int: - """Return bytes that dense storage (1 full array per period) would use.""" - return n * numpy.dtype(dtype).itemsize * n_periods - - -# --------------------------------------------------------------------------- -# Memory benchmarks -# --------------------------------------------------------------------------- - - -class TestAsOfMemory: - """Compare sparse storage vs hypothetical dense storage.""" - - @pytest.mark.parametrize( - "n,n_patches,change_rate", - [ - (10_000, 30, 0.005), - (100_000, 30, 0.005), - (1_000_000, 30, 0.005), - ], - ids=["10K-30p-0.5%", "100K-30p-0.5%", "1M-30p-0.5%"], - ) - def test_memory_dense_vs_patches(self, n, n_patches, change_rate, capsys): - rng = numpy.random.default_rng(42) - holder = _make_holder(n) - _populate(holder, n_patches, change_rate, rng) - - sparse = _patch_memory(holder) - dense = _dense_memory(n, holder._as_of_base.dtype, n_patches + 1) - ratio = dense / sparse - - with capsys.disabled(): - print( # noqa: T201 - f"\n N={n:>9,} patches={n_patches} r={change_rate:.1%}" - f" sparse={sparse / 1e6:.2f} Mo" - f" dense={dense / 1e6:.2f} Mo" - f" ratio={ratio:.1f}×" - ) - - if n == 1_000_000: - assert ( - ratio > 10 - 
), f"Expected >10× memory gain for N=1M, r=0.5%, P=30; got {ratio:.1f}×" - - -# --------------------------------------------------------------------------- -# Compute benchmarks -# --------------------------------------------------------------------------- - - -class TestAsOfCompute: - """Measure GET performance for sequential and backward-jump access.""" - - N = 1_000_000 - N_PATCHES = 30 - CHANGE_RATE = 0.005 - - @pytest.fixture(autouse=True) - def _holder(self): - rng = numpy.random.default_rng(42) - self.holder = _make_holder(self.N) - _populate(self.holder, self.N_PATCHES, self.CHANGE_RATE, rng) - # Build the list of period strings that were stored - self.periods = ["2020-01"] - for _p in range(1, self.N_PATCHES + 1): - month = ( - f"2020-{_p + 1:02d}" - if _p < 12 - else f"{2020 + _p // 12}-{_p % 12 + 1:02d}" - ) - self.periods.append(month) - - def test_get_sequential(self, benchmark): - """360 sequential GETs on 1M persons with 30 patches (snapshot cursor).""" - holder = self.holder - periods_objs = [period(p) for p in self.periods] - # Extend to 360 periods by repeating the last period - while len(periods_objs) < 360: - periods_objs.append(periods_objs[-1]) - - def _run(): - holder._as_of_snapshots.clear() # reset LRU cache for fair comparison - for p in periods_objs: - holder.get_array(p) - - benchmark.pedantic(_run, rounds=5, iterations=1) - - def test_get_sequential_with_snapshot(self, benchmark): - """360 sequential GETs on 1M persons — snapshot cursor warmed up.""" - holder = self.holder - periods_objs = [period(p) for p in self.periods] - while len(periods_objs) < 360: - periods_objs.append(periods_objs[-1]) - - # Warm up snapshot at the start - holder.get_array(periods_objs[0]) - - def _run(): - for p in periods_objs: - holder.get_array(p) - - benchmark.pedantic(_run, rounds=5, iterations=1) - - def test_get_backward_jump(self, benchmark): - """GET at last period then GET at first period (backward jump = O(N+k×P)).""" - holder = self.holder - first 
= period(self.periods[0]) - last = period(self.periods[-1]) - - def _run(): - holder.get_array(last) # forward → builds snapshot - holder.get_array(first) # backward → full reconstruction - - benchmark.pedantic(_run, rounds=5, iterations=1) - - -# --------------------------------------------------------------------------- -# Forward-simulation benchmark (real use case) -# --------------------------------------------------------------------------- - - -class TestForwardSimulationBench: - """Model the real use case: month-by-month simulation. - - Pattern: GET(M-1) → apply rule → SET(M) → GET(M) → apply rule → SET(M+1)... - - Each echelon at month M depends on echelon at month M-1 plus a stochastic - transition (some fraction of persons change state each month). - """ - - N = 1_000_000 - - @pytest.mark.parametrize( - "n_months,change_rate", - [ - (12, 0.10), - (60, 0.10), - (60, 0.30), - ], - ids=["1yr-10%", "5yr-10%", "5yr-30%"], - ) - def test_forward_simulation(self, benchmark, n_months, change_rate): - """Forward GET→SET simulation over n_months on 1M persons.""" - N = self.N - rng = numpy.random.default_rng(42) - - # Pre-generate all random transitions (excludes RNG cost from timing) - k = max(1, int(N * change_rate)) - all_idx = [rng.choice(N, size=k, replace=False) for _ in range(n_months)] - all_vals = [ - rng.integers(0, 10, size=k).astype(numpy.int32) for _ in range(n_months) - ] - - base = rng.integers(0, 10, size=N).astype(numpy.int32) - months = ["2020-01"] + [ - f"{2020 + m // 12}-{m % 12 + 1:02d}" for m in range(1, n_months + 1) - ] - - def _run(): - h = _make_holder(N) - h.set_input(months[0], base.copy()) - for m in range(1, n_months + 1): - h.set_input_sparse(months[m], all_idx[m - 1], all_vals[m - 1]) - - benchmark.pedantic(_run, rounds=3, iterations=1) - - -# --------------------------------------------------------------------------- -# set_input_sparse vs set_input comparison -# 
--------------------------------------------------------------------------- - - -class TestSetInputSparseVsDense: - """Compare set_input (dense O(N) diff) vs set_input_sparse (O(k) + O(N) snapshot). - - Run with: - .venv/bin/pytest benchmarks/test_bench_asof.py -v --benchmark-sort=name -k "sparse" - """ - - N = 1_000_000 - - @pytest.mark.parametrize( - "n_months,change_rate", - [ - (12, 0.10), - (60, 0.10), - (60, 0.30), - ], - ids=["1yr-10%", "5yr-10%", "5yr-30%"], - ) - def test_dense(self, benchmark, n_months, change_rate): - """Forward simulation using set_input — O(N) diff + copy per SET.""" - N = self.N - rng = numpy.random.default_rng(42) - k = max(1, int(N * change_rate)) - all_idx = [rng.choice(N, size=k, replace=False) for _ in range(n_months)] - all_vals = [ - rng.integers(0, 10, size=k).astype(numpy.int32) for _ in range(n_months) - ] - base = rng.integers(0, 10, size=N).astype(numpy.int32) - months = ["2020-01"] + [ - f"{2020 + m // 12}-{m % 12 + 1:02d}" for m in range(1, n_months + 1) - ] - months_periods = [period(m) for m in months] - - def _run(): - h = _make_holder(N) - h.set_input(months[0], base.copy()) - for m in range(1, n_months + 1): - prev = h.get_array(months_periods[m - 1]) - new_val = prev.copy() - new_val[all_idx[m - 1]] = all_vals[m - 1] - h.set_input(months[m], new_val) - - benchmark.pedantic(_run, rounds=3, iterations=1) - - @pytest.mark.parametrize( - "n_months,change_rate", - [ - (12, 0.10), - (60, 0.10), - (60, 0.30), - ], - ids=["1yr-10%", "5yr-10%", "5yr-30%"], - ) - def test_sparse(self, benchmark, n_months, change_rate): - """Forward simulation using set_input_sparse — skips O(N) diff entirely.""" - N = self.N - rng = numpy.random.default_rng(42) - k = max(1, int(N * change_rate)) - all_idx = [rng.choice(N, size=k, replace=False) for _ in range(n_months)] - all_vals = [ - rng.integers(0, 10, size=k).astype(numpy.int32) for _ in range(n_months) - ] - base = rng.integers(0, 10, size=N).astype(numpy.int32) - months = ["2020-01"] 
+ [ - f"{2020 + m // 12}-{m % 12 + 1:02d}" for m in range(1, n_months + 1) - ] - - def _run(): - h = _make_holder(N) - h.set_input(months[0], base.copy()) - for m in range(1, n_months + 1): - h.set_input_sparse(months[m], all_idx[m - 1], all_vals[m - 1]) - - benchmark.pedantic(_run, rounds=3, iterations=1) diff --git a/benchmarks/test_bench_compute.py b/benchmarks/test_bench_compute.py deleted file mode 100644 index 5a8ad7198..000000000 --- a/benchmarks/test_bench_compute.py +++ /dev/null @@ -1,141 +0,0 @@ -"""Compute time benchmarks for OpenFisca-Core. - -Uses pytest-benchmark for statistically rigorous measurements. -Run with: pytest benchmarks/test_bench_compute.py -v --benchmark-sort=name -""" - -import pytest - -# --------------------------------------------------------------------------- -# S1: members_position (the function we just vectorized) -# --------------------------------------------------------------------------- - - -class TestMembersPositionBench: - """Benchmark GroupPopulation.members_position.""" - - @pytest.mark.parametrize( - "nb_persons,nb_entities", - [ - pytest.param(100, 40, id="N=100"), - pytest.param(10_000, 4_000, id="N=10K"), - pytest.param(100_000, 40_000, id="N=100K"), - pytest.param(1_000_000, 400_000, id="N=1M"), - ], - ) - def test_members_position( - self, benchmark, nb_persons, nb_entities, make_group_population - ): - pop = make_group_population(nb_persons, nb_entities) - - def run(): - pop._members_position = None # force recompute - return pop.members_position - - result = benchmark.pedantic(run, iterations=3, rounds=5, warmup_rounds=1) - assert len(result) == nb_persons - - -# --------------------------------------------------------------------------- -# S2: GroupPopulation aggregations (sum, any) -# --------------------------------------------------------------------------- - - -class TestGroupAggregationBench: - """Benchmark household.sum() and household.any().""" - - @pytest.mark.parametrize( - "nb_persons", - [ - 
pytest.param(10_000, id="N=10K"), - pytest.param(100_000, id="N=100K"), - ], - ) - def test_household_sum(self, benchmark, nb_persons, make_simulation): - sim = make_simulation(nb_persons) - - def run(): - household = sim.populations["household"] - salaries = household.members("salary", "2024-01") - return household.sum(salaries) - - result = benchmark.pedantic(run, iterations=5, rounds=5, warmup_rounds=1) - assert len(result) > 0 - - @pytest.mark.parametrize( - "nb_persons", - [ - pytest.param(10_000, id="N=10K"), - pytest.param(100_000, id="N=100K"), - ], - ) - def test_household_any(self, benchmark, nb_persons, make_simulation): - sim = make_simulation(nb_persons) - - def run(): - household = sim.populations["household"] - salaries = household.members("salary", "2024-01") - return household.any(salaries > 3000) - - result = benchmark.pedantic(run, iterations=5, rounds=5, warmup_rounds=1) - assert len(result) > 0 - - -# --------------------------------------------------------------------------- -# S3: Full simulation (disposable_income) -# --------------------------------------------------------------------------- - - -class TestFullSimulationBench: - """Benchmark a full disposable_income calculation.""" - - @pytest.mark.parametrize( - "nb_persons", - [ - pytest.param(100, id="N=100"), - pytest.param(10_000, id="N=10K"), - pytest.param(100_000, id="N=100K"), - ], - ) - def test_disposable_income(self, benchmark, nb_persons, make_simulation): - sim = make_simulation(nb_persons) - - def run(): - return sim.calculate("disposable_income", "2024-01") - - result = benchmark.pedantic(run, iterations=1, rounds=3, warmup_rounds=1) - assert len(result) > 0 - - @pytest.mark.parametrize( - "nb_persons", - [ - pytest.param(100, id="N=100"), - pytest.param(10_000, id="N=10K"), - ], - ) - def test_income_tax(self, benchmark, nb_persons, make_simulation): - sim = make_simulation(nb_persons) - - def run(): - return sim.calculate("income_tax", "2024-01") - - result = 
benchmark.pedantic(run, iterations=3, rounds=5, warmup_rounds=1) - assert len(result) > 0 - - -# --------------------------------------------------------------------------- -# S4: TBS loading -# --------------------------------------------------------------------------- - - -class TestTBSLoadingBench: - """Benchmark TaxBenefitSystem initialization.""" - - def test_tbs_loading(self, benchmark): - def run(): - from openfisca_country_template import CountryTaxBenefitSystem - - return CountryTaxBenefitSystem() - - result = benchmark.pedantic(run, iterations=1, rounds=3, warmup_rounds=1) - assert result is not None diff --git a/benchmarks/test_bench_memory.py b/benchmarks/test_bench_memory.py deleted file mode 100644 index 59a1f1ca0..000000000 --- a/benchmarks/test_bench_memory.py +++ /dev/null @@ -1,200 +0,0 @@ -"""Memory benchmarks for OpenFisca-Core. - -Uses tracemalloc (stdlib) for peak memory measurements. -Run with: pytest benchmarks/test_bench_memory.py -v -s -(the -s flag is needed to see the printed memory reports) -""" - -import tracemalloc - -import pytest - - -def _measure_memory(func): - """Run func and return (result, current_bytes, peak_bytes).""" - tracemalloc.start() - tracemalloc.reset_peak() - result = func() - current, peak = tracemalloc.get_traced_memory() - tracemalloc.stop() - return result, current, peak - - -def _fmt(nbytes): - """Format bytes as human-readable string.""" - if nbytes < 1024: - return f"{nbytes} B" - if nbytes < 1024**2: - return f"{nbytes / 1024:.1f} KB" - return f"{nbytes / 1024**2:.1f} MB" - - -# --------------------------------------------------------------------------- -# M1: members_position memory -# --------------------------------------------------------------------------- - - -class TestMembersPositionMemory: - """Measure memory for members_position computation.""" - - @pytest.mark.parametrize( - "nb_persons", - [ - pytest.param(10_000, id="N=10K"), - pytest.param(100_000, id="N=100K"), - pytest.param(1_000_000, 
id="N=1M"), - ], - ) - def test_members_position_memory(self, nb_persons, make_group_population): - pop = make_group_population(nb_persons) - - _, current, peak = _measure_memory(lambda: pop.members_position) - - per_person = peak / nb_persons - msg = ( - f"\n [members_position] N={nb_persons:>10_d}" - f" current={_fmt(current)}" - f" peak={_fmt(peak)}" - f" per_person={per_person:.0f} B" - ) - print(msg) # noqa: T201 - - # Sanity check: should not use more than 100 bytes per person - # (the arrays themselves are ~4 bytes each, but intermediates exist) - assert per_person < 100, f"Too much memory: {per_person:.0f} B/person" - - -# --------------------------------------------------------------------------- -# M2: Full simulation memory -# --------------------------------------------------------------------------- - - -class TestSimulationMemory: - """Measure memory for full simulation calculations.""" - - @pytest.mark.parametrize( - "nb_persons", - [ - pytest.param(10_000, id="N=10K"), - pytest.param(100_000, id="N=100K"), - ], - ) - def test_disposable_income_memory(self, nb_persons, make_simulation): - sim = make_simulation(nb_persons) - - _, current, peak = _measure_memory( - lambda: sim.calculate("disposable_income", "2024-01") - ) - - per_person = peak / nb_persons - msg = ( - f"\n [disposable_income] N={nb_persons:>10_d}" - f" current={_fmt(current)}" - f" peak={_fmt(peak)}" - f" per_person={per_person:.0f} B" - ) - print(msg) # noqa: T201 - - @pytest.mark.parametrize( - "nb_persons", - [ - pytest.param(10_000, id="N=10K"), - pytest.param(100_000, id="N=100K"), - ], - ) - def test_multi_period_memory(self, nb_persons, make_simulation): - """Measure memory growth over 12 monthly calculations.""" - sim = make_simulation(nb_persons) - - def run_12_months(): - for month in range(1, 13): - sim.calculate("disposable_income", f"2024-{month:02d}") - - _, current, peak = _measure_memory(run_12_months) - - per_person = peak / nb_persons - msg = ( - f"\n [12-month 
simulation] N={nb_persons:>10_d}" - f" current={_fmt(current)}" - f" peak={_fmt(peak)}" - f" per_person={per_person:.0f} B" - f" per_person_per_month={per_person / 12:.0f} B" - ) - print(msg) # noqa: T201 - - -# --------------------------------------------------------------------------- -# M3: Per-variable memory cost -# --------------------------------------------------------------------------- - - -class TestPerVariableMemory: - """Measure the incremental memory cost of calculating one more variable.""" - - def test_per_variable_cost(self, make_simulation): - nb_persons = 100_000 - sim = make_simulation(nb_persons) - - variables = [ - "salary", - "income_tax", - "social_security_contribution", - "basic_income", - "pension", - ] - - print(f"\n Per-variable memory cost (N={nb_persons:_d}):") # noqa: T201 - print( # noqa: T201 - f" {'Variable':<35s} {'Current':>10s} {'Peak':>10s} {'Marginal':>10s}" - ) - print(f" {'-' * 35} {'-' * 10} {'-' * 10} {'-' * 10}") # noqa: T201 - - prev_current = 0 - for var_name in variables: - tracemalloc.start() - tracemalloc.reset_peak() - sim.calculate(var_name, "2024-01") - current, peak = tracemalloc.get_traced_memory() - tracemalloc.stop() - - marginal = current - prev_current - msg = ( - f" {var_name:<35s}" - f" {_fmt(current):>10s}" - f" {_fmt(peak):>10s}" - f" {_fmt(marginal):>10s}" - ) - print(msg) # noqa: T201 - prev_current = current - - -# --------------------------------------------------------------------------- -# M4: Scaling analysis -# --------------------------------------------------------------------------- - - -class TestScalingAnalysis: - """Verify that memory scales linearly with population size.""" - - def test_memory_scales_linearly(self, make_simulation): - """Memory should roughly double when population doubles.""" - sizes = [10_000, 20_000, 40_000] - peaks = [] - - for n in sizes: - sim = make_simulation(n) - _, _, peak = _measure_memory( - lambda sim=sim: sim.calculate("income_tax", "2024-01") - ) - 
peaks.append(peak) - print(f"\n N={n:>6_d} peak={_fmt(peak)}") # noqa: T201 - - # Check roughly linear: ratio should be close to 2 - ratio_1 = peaks[1] / peaks[0] - ratio_2 = peaks[2] / peaks[1] - print(f"\n Ratio {sizes[1]:_}/{sizes[0]:_} = {ratio_1:.2f}x") # noqa: T201 - print(f" Ratio {sizes[2]:_}/{sizes[1]:_} = {ratio_2:.2f}x") # noqa: T201 - - # Allow tolerance for fixed overhead at small sizes - assert 1.2 < ratio_1 < 3.0, f"Non-linear scaling: {ratio_1:.2f}x" - assert 1.2 < ratio_2 < 3.0, f"Non-linear scaling: {ratio_2:.2f}x" diff --git a/docs/implementation/asof-variable.md b/docs/implementation/asof-variable.md deleted file mode 100644 index 812e7752f..000000000 --- a/docs/implementation/asof-variable.md +++ /dev/null @@ -1,1373 +0,0 @@ -# AsOfVariable — Variables à valeur persistante - -## Concept - -Une `AsOfVariable` est une variable dont la valeur, une fois fixée à un -**instant**, persiste dans le temps jusqu'à ce qu'elle soit explicitement -modifiée. C'est l'analogue vectoriel des paramètres OpenFisca. - -``` -Comparison des sémantiques : - -Variable classique (MONTH) : - 2024-01: [OWNER, TENANT] ← renseigné - 2024-02: [default, default] ← non renseigné → valeur par défaut ! - 2024-03: [default, default] - -AsOfVariable (MONTH, as_of=True) : - Instant 2024-01-01: [OWNER, TENANT] ← renseigné - 2024-02: [OWNER, TENANT] ← persiste automatiquement - Instant 2024-04-15: personne 1 → TENANT ← changement - 2024-05: [TENANT, TENANT] ← nouvelle valeur persiste -``` - -### Orthogonalité avec definition_period - -`as_of` porte sur un **instant** (quand la valeur change). -`definition_period` porte sur une **période** (granularité de calcul). 
-Les deux sont indépendants : - -```python -class housing_occupancy_status(Variable): - value_type = Enum - possible_values = HousingOccupancyStatus - entity = Household - definition_period = MONTH # utilisable dans des formules mensuelles - as_of = True # valeur ancrée à un instant, persiste -``` - -| | Paramètre | Variable | AsOfVariable | -|---|---|---|---| -| Valeur | Scalaire | Vecteur (N,) | Vecteur (N,) | -| Indexé par | Instant | Période | **Instant** | -| Persiste | ✅ | ❌ (default) | ✅ | -| definition_period | — | MONTH/YEAR | MONTH/YEAR | -| A une formule | Non | Possible | Possible | - -## Analyse du code existant - -### Flux actuel : `Simulation._calculate` → `Holder.get_array` - -``` -Simulation._calculate(variable_name, period) - → holder.get_array(period) - → self._memory_storage.get(period) # dict lookup exact: _arrays[period] - → retourne None si pas trouvé - → si None: lance la formule ou retourne default_array -``` - -Le point de branchement est `Holder.get_array` (ligne 83 de holder.py). -Actuellement, c'est un lookup exact par période dans un dict. Pour une -AsOfVariable, il faut un lookup "le plus récent ≤ période demandée". - -### Flux d'écriture : `set_input` → `_set` → `InMemoryStorage.put` - -``` -Holder.set_input(period, array) - → Holder._set(period, value) - → self._memory_storage.put(value, period) # _arrays[period] = value -``` - -Pour une AsOfVariable, le `set_input` reçoit une valeur à un instant -(= début de période). Le stockage est le même, mais le GET change. - -## Implémentation proposée - -### Niveau 1 : Lookup "as of" dans get_array (~20 lignes) - -Modifier `Holder.get_array` pour chercher la période la plus récente -quand la variable est `as_of` : - -```python -# holder.py, dans Holder: - -def __init__(self, variable, population): - # ... existant ... 
- self._as_of = getattr(self.variable, 'as_of', False) - -def get_array(self, period): - if self.variable.is_neutralized: - return self.default_array() - - value = self._memory_storage.get(period) - if value is not None: - return value - - # NOUVEAU : pour les AsOfVariable, chercher la valeur la plus récente - if self._as_of: - value = self._get_as_of(period) - if value is not None: - return value - - if self._disk_storage: - return self._disk_storage.get(period) - return None - -def _get_as_of(self, period): - """Find the most recent stored value at or before period.start.""" - target = period.start - best_period = None - best_start = None - - for known_period in self._memory_storage.get_known_periods(): - start = known_period.start - if start <= target: - if best_start is None or start > best_start: - best_start = start - best_period = known_period - - if best_period is not None: - return self._memory_storage.get(best_period) - return None -``` - -### Attribut sur Variable (~5 lignes) - -```python -# variable.py, dans Variable.__init__: - -self.as_of = self.set( - attr, - "as_of", - required=False, - default=False, - allowed_type=bool, -) -``` - -### Niveau 2 : Optimisation du lookup O(log P) avec bisect (~30 lignes) - -Le lookup linéaire O(P) dans `_get_as_of` peut être optimisé en -maintenant une liste triée d'instants : - -```python -import bisect - -class AsOfMixin: - def __init__(self): - self._sorted_instants = [] # maintenu trié - - def _register_instant(self, period): - instant = period.start - pos = bisect.bisect_right(self._sorted_instants, instant) - if pos == 0 or self._sorted_instants[pos - 1] != instant: - self._sorted_instants.insert(pos, instant) - - def _get_as_of(self, period): - target = period.start - pos = bisect.bisect_right(self._sorted_instants, target) - if pos > 0: - best_instant = self._sorted_instants[pos - 1] - # Retrouver la période qui a cet instant comme start - for known_period in self._memory_storage.get_known_periods(): - if 
known_period.start == best_instant: - return self._memory_storage.get(known_period) - return None -``` - -### Niveau 3 : Économie mémoire avec reference sharing (~15 lignes en plus) - -Quand une AsOfVariable est renseignée mais que la valeur n'a pas changé -par rapport à l'instant précédent, on réutilise le **même objet** Python : - -```python -def _set_as_of(self, period, value): - """Set input with reference sharing for unchanged arrays.""" - prev = self._get_as_of(period) - if prev is not None and numpy.array_equal(value, prev): - # La valeur n'a pas changé → pointer vers le même objet - self._memory_storage.put(prev, period) - else: - self._memory_storage.put(value.copy(), period) - self._register_instant(period) -``` - -## Compatibilité vectorielle - -### Aucun impact sur les formules - -Le contrat est : `holder.get_array(period)` retourne toujours un -`ndarray` dense de shape `(N,)`. Les formules ne savent pas si la valeur -vient d'un lookup exact ou d'un lookup "as of" : - -```python -# Cette formule fonctionne IDENTIQUEMENT pour Variable et AsOfVariable -def formula(household, period): - status = household.members("housing_occupancy_status", period) - # status est un ndarray (N,), comme toujours - return household.any(status == HousingOccupancyStatus.OWNER) -``` - -### Aucun impact sur les agrégations - -`household.sum()`, `household.any()`, `bincount`, fancy indexing — tout -fonctionne car l'opérande est un ndarray dense standard. - -### Le seul coût - -La matérialisation au GET pour le niveau 3 (patches) coûte O(N) pour la -copie. Mais les niveaux 1-2 ne copient pas (ils retournent une référence -vers l'array stocké). - -**Attention** : si une formule modifie l'array retourné in-place, elle -corromprait aussi les autres périodes qui pointent vers le même objet. 
-Deux options : -- Retourner une copie défensive (CPU cost) -- Documenter que les arrays retournés sont en lecture seule - -## Exemples de variables candidates - -| Variable | definition_period | Fréquence de changement | Gain mémoire | -|---|---|---|---| -| `housing_occupancy_status` | MONTH | ~0.5%/mois | ~25× | -| `marital_status` | MONTH | ~0.1%/mois | ~50× | -| `employer_id` | MONTH | ~2%/mois | ~10× | -| `region_code` | MONTH | ~0.3%/mois | ~30× | -| `birth` | ETERNITY | jamais | déjà ETERNITY | -| `salary` | MONTH | ~5%/mois | ~3× (peu d'intérêt) | - -## Plan d'implémentation - -| Phase | Quoi | Effort | Risque | -|---|---|---|---| -| 1 | Attribut `as_of=True` sur Variable | 0.5 jour | Nul | -| 2 | `_get_as_of` dans Holder.get_array | 1 jour | Faible | -| 3 | Tests unitaires (AsOf lookup, edge cases) | 0.5 jour | Nul | -| 4 | Optimisation bisect (si P > 50) | 0.5 jour | Nul | -| 5 | Reference sharing au set_input | 0.5 jour | Faible | -| 6 | country-template: marquer les variables candidates | 0.5 jour | Nul | -| **Total** | | **~3.5 jours** | **Faible** | - -## Risques - -1. **Mutation in-place** : si une formule fait `result += 1` sur l'array - retourné, elle modifie aussi les autres périodes. Mitigation : copie - défensive ou `ndarray.flags.writeable = False`. - -2. **Interaction avec set_input_divide_by_period** : ces helpers - répartissent un input sur plusieurs sous-périodes. Pour une AsOfVariable, - cette répartition n'a pas de sens. Il faut vérifier la cohérence. - -3. **Interaction avec OnDiskStorage** : le lookup "as of" doit aussi - marcher pour le stockage disque. Probablement pas critique en v1. - -## Ce qui a été implémenté — v44.3.0 - -[PR #1365](https://github.com/openfisca/openfisca-core/pull/1365) - -### Variable.as_of - -- Attribut `as_of` ajouté dans `Variable.__init__` via le mécanisme - `set()` standard (fichier `openfisca_core/variables/variable.py`). 
-- Setter `set_as_of()` : normalise `True → "start"`, accepte - `"start"` / `"end"`, rejette toute autre valeur avec `ValueError`. -- Guard : si `as_of` et un helper `set_input` (ex. - `set_input_divide_by_period`) sont tous deux déclarés → `ValueError` - à l'instantiation (incompatibilité sémantique). - -### Holder (fichier `openfisca_core/holders/holder.py`) - -- `__init__` : cache `self._as_of` et initialise `self._sorted_instants` - (liste triée d'instants, uniquement si `_as_of` est actif). -- `get_array` : si le lookup mémoire exact échoue et que `_as_of` est - actif, appelle `_get_as_of(period)` avant de tenter le disk storage. -- `_get_as_of(period)` : lookup O(log P) via `bisect.bisect_right` sur - `_sorted_instants`, avec fallback linéaire pour les holders clonés - dont `_sorted_instants` n'a pas encore été reconstruit. -- `_register_instant(period)` : insertion triée sans doublon dans - `_sorted_instants`, appelée à chaque `_set`. -- `_set` : pour les variables `as_of`, calcule si la valeur a changé - par rapport à l'état précédent. Si inchangée : reference sharing - (même objet Python réutilisé). Si changée : copie défensive + - `array.flags.writeable = False` pour protéger le stockage. -- `clone()` : copie `_sorted_instants` indépendamment pour éviter la - corruption entre la simulation originale et sa copie. 
- -### Risques mitigés - -| Risque | Statut | -|---|---| -| Mutation in-place | ✅ `writeable = False` sur tous les arrays stockés | -| Interaction `set_input_divide_by_period` | ✅ `ValueError` au chargement | -| OnDiskStorage | ⚠️ Non traité — le lookup `_get_as_of` n'interroge pas le disk storage | - -### Tests - -18 tests dans `tests/core/test_asof_variable.py`, sans dépendance à -`openfisca-country-template` : - -| Test | Ce qu'il vérifie | -|---|---| -| `test_asof_persists_forward` | Valeur de jan retrouvée en fév, mar, déc | -| `test_asof_no_value_before_first_stored` | `None` si rien posé avant le target | -| `test_asof_exact_match_returns_stored_value` | Lookup exact passe par le chemin rapide | -| `test_asof_takes_most_recent_value` | La valeur la plus récente ≤ target est retournée | -| `test_asof_convention_start` | Valeur mi-année invisible pour une période antérieure | -| `test_asof_convention_end` | Valeur mi-année visible pour un YEAR avec `as_of="end"` | -| `test_non_asof_variable_unaffected` | Variables sans `as_of` restent inchangées | -| `test_asof_no_patch_when_value_unchanged` | Aucun patch stocké si la valeur ne change pas | -| `test_asof_patch_stores_only_changed_indices` | Un patch ne stocke que les indices/valeurs modifiés | -| `test_asof_retroactive_patch` | Un `set_input` rétroactif est reflété dans tous les GETs ultérieurs | -| `test_asof_snapshot_cursor_no_copy_between_patches` | GETs séquentiels sans patch entre eux réutilisent le même array | -| `test_asof_base_array_is_read_only` | Le tableau de base a `writeable=False` | -| `test_asof_get_array_returns_read_only` | `get_array` retourne toujours un array read-only | -| `test_asof_setting_value_does_not_mutate_caller_array` | Le tableau du caller reste modifiable | -| `test_as_of_true_normalises_to_start` | `True` → `"start"` | -| `test_as_of_false_default` | Absence de `as_of` → `False` | -| `test_as_of_invalid_value_raises` | Valeur inconnue → `ValueError` | -| 
`test_as_of_with_set_input_helper_raises` | `as_of` + helper dispatch → `ValueError` | - -### Ce qui a été implémenté — v44.4.0 - -> PR #1366 (branche `feat/as-of-patches`) — remplace le stockage dense de -> v44.3.0 par un stockage sparse (base + patches) avec snapshot cursor. - -#### Holder : nouveaux attributs - -| Attribut | Type | Description | -|---|---|---| -| `_as_of_base` | `ndarray` (read-only) | Premier tableau complet posé via `set_input` | -| `_as_of_base_instant` | `Instant` | Instant auquel la base a été établie | -| `_as_of_patches` | `list[(Instant, ndarray[int32], ndarray[dtype])]` | Diffs successifs triés par instant | -| `_as_of_patch_instants` | `list[Instant]` | Liste parallèle à `_as_of_patches` pour `bisect` | -| `_as_of_snapshot` | `(Instant, ndarray, int) \| None` | Cursor cache : `(instant, array, last_patch_idx)` | - -#### Méthodes - -- **`_set_as_of(period, value)`** : 1er appel → base read-only ; appels - suivants → diff sparse via `numpy.where`. Les patches sont insérés en - position triée (via `bisect`) pour gérer les `set_input` rétroactifs. - Le snapshot est invalidé si un patch rétroactif couvre son instant. -- **`_reconstruct_at(target_instant)`** : bisect + snapshot cursor. - - O(1) cache hit si `snap_instant == target_instant`. - - O(k) forward si `snap_instant < target_instant` et snapshot valide - (`snap_patch_idx <= last_patch_idx`). - - O(N + k×P) backward (ou premier accès) : recalcul depuis la base. - - Retourne `None` si aucune base ou `target_instant < base_instant`. -- **`_get_as_of(period)`** : thin wrapper → `_reconstruct_at(period.start)` - ou `period.stop` selon la convention `as_of`. -- **`_set`** : route vers `_set_as_of` si `self._as_of`, court-circuite - `_memory_storage` et le disk storage. -- **`get_array`** : court-circuite vers `_get_as_of` si `self._as_of`. 
-- **`clone`** : `_as_of_base` partagée (read-only), listes de patches - copiées indépendamment (les arrays idx/vals internes sont read-only), - snapshot réinitialisé à `None`. - -#### Risques mitigés (mis à jour) - -| Risque | Mitigation | -|---|---| -| Mutation in-place | ✅ `_as_of_base.flags.writeable = False` ; arrays retournés par `get_array` sont read-only | -| Retroactive `set_input` | ✅ Patches insérés en position triée ; snapshot invalidé si `snap_instant >= instant` | -| Interaction `set_input_divide_by_period` | ✅ `ValueError` au chargement | -| OnDiskStorage | ⚠️ Non traité — `_get_as_of` n'interroge pas le disk storage | - -#### Ce qui n'est PAS encore implémenté - -- Multi-snapshot LRU (un seul snapshot curseur maintenu par holder) -- Garbage collection de patches anciens -- Stratégie de stockage configurable par variable (`storage_strategy`, - `max_snapshots`) - -## Pourquoi le stockage dense était insuffisant (résolu en v44.4.0) - -### Le problème fondamental - -L'optimisation de reference sharing n'est utile que si -`numpy.array_equal(new_value, prev_value)` est `True` — c'est-à-dire -quand **aucun individu** n'a changé de valeur. Pour les variables dont -les changements sont dispersés dans la population (chaque mois, quelques -centaines ou milliers de personnes changent, mais pas les mêmes), cette -condition n'est quasiment jamais vraie. - -**Scénario concret** : 100 000 fonctionnaires, variable `echelon` -(int16), 0,5–2 % changent par mois de façon aléatoire et indépendante, -simulation sur 10 ans (120 mois). - -``` -Mois 1 : [3, 5, 2, 7, 1, ...] ← 1 tableau plein stocké -Mois 2 : [3, 5, 2, 8, 1, ...] ← 1 seul changement (personne #4) - ^--- mais numpy.array_equal → False → NOUVEAU tableau plein stocké -Mois 3 : [3, 5, 2, 8, 2, ...] ← 1 seul changement (personne #5) - ^--- encore un tableau plein -... -Mois 120: ... 
← 120 tableaux pleins au total -``` - -L'optimisation reference sharing du `_set` ne détecte pas les -changements partiels : elle compare les tableaux entiers. - -### Chiffrage mémoire - -| Stratégie | N = 100K | N = 1M | Remarque | -|---|---|---|---| -| **Dense v44.3.0** (1 `set_input`/mois) | 120 × 200 Ko = **24 Mo** | 120 × 2 Mo = **240 Mo** | Reference sharing = 0% gain | -| **Patches sparse** | base 200 Ko + 120 × ~2 Ko = **~0,4 Mo** | base 2 Mo + 120 × ~20 Ko = **~4 Mo** | ×60 moins de mémoire | - -Le gain réel dépend du taux de changement mensuel `r` : - -``` -Gain mémoire ≈ 1 / (2r + 1/P) où P = nombre de mois, r = taux de changement - -r = 0,5%, P = 120 → gain ≈ ×40 -r = 2%, P = 120 → gain ≈ ×12 -r = 10%, P = 120 → gain ≈ ×3 (l'AsOf perd de son intérêt) -r = 50%, P = 120 → gain ≈ ×1 (aussi bien de rester en dense) -``` - -### Pourquoi la solution est les patches sparse - -La vraie économie mémoire nécessite de ne stocker que le **diff** à -chaque `set_input`. L'API externe reste identique ; c'est le stockage -interne qui change : - -```python -# Au lieu de : -_memory_storage.put(full_array, period) - -# On stocke : -_as_of_base : ndarray # tableau initial — 1 seule copie (N) -_as_of_patches : list[ # diffs successifs - (Instant, indices: ndarray[int32], values: ndarray[dtype]) -] -``` - -À chaque `set_input(period, full_array)` : - -```python -prev = _reconstruct_as_of(period.start) # état précédent -changed = full_array != prev -if not changed.any(): - return # rien n'a changé, aucun stockage -idx = numpy.where(changed)[0].astype(numpy.int32) -vals = full_array[idx].copy() -_as_of_patches.append((period.start, idx, vals)) # seulement ~k×8 octets -``` - -À chaque `get_array(period)` : - -```python -result = _as_of_base.copy() # copie O(N) -for instant, idx, vals in _as_of_patches: - if instant <= target: result[idx] = vals # scatter O(k) - else: break -return result -``` - -Avec le **snapshot cursor** (une seule copie dense gardée en cache), -les accès 
séquentiels jan→fév→mar deviennent incrémentaux O(k) au -lieu de O(N + k×P) à chaque GET. Voir la section -"Stratégie de cache intelligent (snapshot cursor)" pour les détails. - -### Ce que ça change par rapport à v44.3.0 - -| | v44.3.0 (dense) | v44.4.0 (patches) ✅ | -|---|---|---| -| `_memory_storage` pour `as_of` | Utilisé (tableau plein) | Inutilisé — remplacé par `_as_of_base` + `_as_of_patches` | -| Mémoire par `set_input` | O(N) | O(k) — k = nb individus changeant | -| Temps GET séquentiel | O(1) | O(k) avec snapshot cursor | -| Temps GET premier accès | O(1) | O(N) | -| Reference sharing utile | Seulement si 0 changement | Toujours (patches vides non stockés) | -| API externe | — | Identique (`set_input`, `get_array`) | -| Tests adaptés | — | `test_asof_reference_sharing` supprimé, 6 nouveaux tests ajoutés | - -Ce problème est résolu en v44.4.0 — voir section précédente. - -## Convention start / end - -Quand une formule demande une AsOfVariable pour une **période** (pas un -instant), il faut choisir quel instant de la période sert de référence : - -``` -Période demandée : "2024" (YEAR) - ├── start = 2024-01-01 - └── end = 2024-12-31 - -marital_status posé à l'instant 2024-06-15 → MARRIED - -Si as_of = "start" : lookup ≤ 2024-01-01 → SINGLE (valeur précédente) -Si as_of = "end" : lookup ≤ 2024-12-31 → MARRIED -``` - -| Convention | Usage | Exemple | -|---|---|---| -| `as_of = "start"` | Valeur au début de la période | RSA : statut au 1er du mois | -| `as_of = "end"` | Valeur à la fin de la période | IR : situation au 31 décembre | - -Défaut recommandé : `"start"` (cohérent avec le fonctionnement actuel, -cas le plus fréquent pour les droits sociaux mensuels). - -## Interaction formule + as_of - -### Cas normal (recommandé) : as_of sans formula - -La variable est purement input-driven. Seules les valeurs posées via -`set_input` sont stockées. `get_array` retourne le dernier état sparse -reconstruit ; aucune formula n'est jamais invoquée. 
- -### Cas avec formula - -La formula s'exécute **uniquement** pour le premier appel dont -`holder.get_array(period)` retourne `None` (base pas encore établie, -ou accès avant la base). Une fois la base établie, tous les GETs -suivants retournent la valeur persistée → **la formula n'est plus -jamais rappelée**. Ce comportement est intentionnel. - -Mécanisme interne : - -```python -# Dans Simulation._calculate (simplifié) : -cached_array = holder.get_array(period) # appelle _get_as_of -if cached_array is not None: - return cached_array # ← as_of: retourne dès que la base existe -# Sinon, exécuter la formula… -result = formula(population, period) -holder.put_in_cache(result, period) # → _set_as_of → établit la base -``` - -`_reconstruct_at` retourne un résultat dès que la base existe et que -`target_instant >= base_instant`. La formula n'est donc appelée qu'une -seule fois (à la première période demandée). - -### Cas limite / avertissement - -Si la formula est appelée pour une période antérieure à la base -existante (accès non-séquentiel sans `set_input` préalable), -`_reconstruct_at` retourne `None` et la formula s'exécute à nouveau. -Dans ce cas, l'appel à `_set_as_of` dans `put_in_cache` compare la -nouvelle valeur à l'état reconstruit à cet instant (qui est `None` → -la valeur entière est comparée à `None` → numpy lève une exception). - -**Mitigation** : poser toujours un `set_input` pour la période initiale -avant de calculer une as_of variable avec formula, ou s'assurer d'un -accès séquentiel (chronologique). - -### Incompatibilité explicite - -`as_of` combiné avec un `set_input` helper (ex. -`set_input_divide_by_period`) lève une `ValueError` dès l'instantiation -de la variable. Voir `test_as_of_with_set_input_helper_raises`. - -## Outil de benchmark - -Le script `benchmarks/test_bench_asof.py` mesure la mémoire et le temps -de la feature as_of avec stockage sparse. 
- -### Lancement - -```bash -# Afficher les résultats mémoire (avec -s pour voir les prints) -.venv/bin/pytest benchmarks/test_bench_asof.py -v -s -k memory - -# Afficher les temps de GET (benchmark compute) -.venv/bin/pytest benchmarks/test_bench_asof.py -v --benchmark-sort=name -k compute -``` - -### Ce qu'il mesure - -- **Mémoire** : `_as_of_base.nbytes + Σ(idx.nbytes + vals.nbytes)` vs - ce que le stockage dense aurait coûté (`N × dtype.itemsize × P`). - Assert que le gain est > 10× pour N=1M, r=0.5%, P=30. -- **Temps GET séquentiel** : 360 GETs consécutifs sur 1M personnes, - 30 patches. Mesure le coût amorti du snapshot cursor. -- **Temps GET backward** : GET au dernier mois puis au premier mois - (backward jump sans snapshot valide), pour mesurer le coût O(N+k×P). - -### Interprétation - -| Résultat | Signification | -|---|---| -| Ratio mémoire > 10× | Le sparse storage est efficace (r faible) | -| GET séquentiel ≈ O(k) par step | Le snapshot cursor fonctionne | -| GET backward ≈ O(N) | Recalcul depuis la base — attendu | - -## Benchmark : mémoire et temps avec patches - -### Protocole - -- Variable int16 (enum-like), 0.5% de changement par patch -- Chaque GET = copie du base array + application des patches -- Mesuré sur i7-1185G7, Python 3.11, numpy, médiane sur 50 runs - -### Résultats (N = 1M personnes) - -| Patches | Dense | AsOf | Gain mém. 
| Temps GET | Copie seule | Surcoût | -|---|---|---|---|---|---|---| -| 1 | 4.0 Mo | 2.1 Mo | 2× | 0.09 ms | 0.07 ms | +18% | -| 3 | 8.0 Mo | 2.2 Mo | 3.7× | 0.11 ms | 0.07 ms | +53% | -| 10 | 22 Mo | 2.5 Mo | 8.8× | 0.18 ms | 0.07 ms | ×2.5 | -| 30 | 62 Mo | 3.5 Mo | 17.7× | 0.36 ms | 0.07 ms | ×5 | -| 100 | 202 Mo | 7.0 Mo | 28.9× | 1.25 ms | 0.07 ms | ×17 | - -### Impact sur une simulation complète - -10 AsOf variables, 360 périodes, 1M personnes, 30 patches/variable : - -| | Dense | AsOf (patches naïfs) | -|---|---|---| -| Mémoire | 620 Mo | 35 Mo | -| Temps GET total | ~0 ms | ~130 ms | - -## Stratégie de cache intelligent (snapshot cursor) - -> ✅ Implémenté en v44.4.0 — voir `Holder._reconstruct_at` dans -> `openfisca_core/holders/holder.py`. - -### Le problème - -Avec les patches naïfs, chaque GET applique **tous** les patches depuis -la base. Pour 30 patches, c'est 30 applications par GET, même si on a -déjà calculé la valeur pour une période voisine. - -### L'idée : garder un snapshot curseur - -Quand on accède à une AsOfVariable pour une période, on **garde le -résultat dense en cache**. Au prochain accès, au lieu de repartir de la -base, on repart du **dernier snapshot** et on n'applique que les patches -**entre le snapshot et la période demandée**. - -``` -Patches : P0(base) P1 P2 P3 P4 P5 ... P29 - -GET("2024-01") : base + P0 → snapshot S₀ = résultat -GET("2024-02") : S₀ + P1 (si P1 ≤ 2024-02) → snapshot S₁ = résultat -GET("2024-03") : S₁ + (rien entre 02 et 03) → snapshot S₂ = S₁ (même ref) -GET("2024-04") : S₂ + P2 (si P2 ≤ 2024-04) → snapshot S₃ = résultat -... -``` - -### Le mécanisme (implémentation réelle) - -Le snapshot est stocké dans `self._as_of_snapshot` sous la forme d'un -tuple `(Instant, array, last_patch_idx)`. 
La méthode -`_reconstruct_at(target_instant)` implémente les trois cas : - -```python -def _reconstruct_at(self, target_instant): - if self._as_of_base is None or target_instant < self._as_of_base_instant: - return None - - # Nombre de patches applicables : bisect sur _as_of_patch_instants - pos = bisect.bisect_right(self._as_of_patch_instants, target_instant) - last_patch_idx = pos - 1 # -1 = seule la base s'applique - - snapshot = self._as_of_snapshot - if snapshot is not None: - snap_instant, snap_array, snap_patch_idx = snapshot - - # Cas 1 : cache hit exact — O(1) - if snap_instant == target_instant: - return snap_array - - # Cas 2 : forward access — O(k) incrémental - if snap_instant < target_instant and snap_patch_idx <= last_patch_idx: - result = snap_array - for i in range(snap_patch_idx + 1, last_patch_idx + 1): - _, idx, vals = self._as_of_patches[i] - if result is snap_array: - result = result.copy() # copy-on-write - result[idx] = vals - if result is not snap_array: - result.flags.writeable = False - self._as_of_snapshot = (target_instant, result, last_patch_idx) - return result - - # Cas 3 : backward jump ou premier accès — O(N + k×P) - result = self._as_of_base.copy() - for i in range(last_patch_idx + 1): - _, idx, vals = self._as_of_patches[i] - result[idx] = vals - result.flags.writeable = False - self._as_of_snapshot = (target_instant, result, last_patch_idx) - return result -``` - -### Analyse des coûts par type d'accès - -| Accès | Patches naïfs | Snapshot cursor | -|---|---|---| -| 1er accès | O(N) + O(k×P) | O(N) + O(k×P) ← identique | -| Suivant, avant dans le temps | O(N) + O(k×P) | **O(k) ou O(1)** ← incrémental | -| Suivant, même date | O(N) + O(k×P) | **O(1)** ← cache hit | -| Retour en arrière | O(N) + O(k×P) | O(N) + O(k×P) ← recalcul | - -### Chiffrage : 360 accès séquentiels, 30 patches, 1M personnes - -| | Patches naïfs | Snapshot cursor | -|---|---|---| -| Copies base (O(N)) | **360** | **1** | -| Applications de patches | 360 × 30 
= **10 800** | **30** | -| Temps total | **~130 ms** | **~5 ms** | - -### L'arbitrage mémoire / temps - -``` - ◄── Plus de mémoire - ──► Plus rapide - -Dense ████████████████████████████ 620 Mo ~0 ms -(1 array/période) - -Snapshot cursor ████ 37 Mo ~5 ms -(base + patches - + 1 snapshot) - -Patches naïfs ███ 35 Mo ~130 ms -(base + patches - seulement) -``` - -| Stratégie | Mémoire | Temps (360 accès) | vs Dense | -|---|---|---|---| -| Dense | 620 Mo | ~0 ms | 1× | -| **Snapshot cursor** | **37 Mo** | **~5 ms** | **17× moins de mém.** | -| Patches naïfs | 35 Mo | ~130 ms | 18× moins de mém. | - -Le snapshot cursor ajoute **1 array** (~2 Mo pour 1M int16) par rapport -aux patches naïfs, mais divise le temps par **~25**. - -### Variante : multi-snapshot (LRU cache) — pas encore implémenté - -Si la simulation fait des accès non-linéaires (ex: formule qui compare -la valeur à P et P-12), on peut garder les **K derniers snapshots** : - -```python -from collections import OrderedDict - -class AsOfHolder: - def __init__(self, base, max_snapshots=3): - self._snapshots = OrderedDict() # instant → (array, patch_idx) - self._max_snapshots = max_snapshots -``` - -Pour K=3, on garde 3 snapshots × 2 Mo = 6 Mo supplémentaires, et on -couvre les patterns d'accès courants (P, P-1, P-12). - -### Résumé - -Le snapshot cursor transforme l'arbitrage mémoire/temps : - -- **Mémoire** : +1 array (~2 Mo) vs patches naïfs = +6% -- **Temps** : ÷25 (130 ms → 5 ms) pour un accès séquentiel -- **Complexité** : ~20 lignes de plus que les patches naïfs -- **Robustesse** : dégénère gracieusement vers patches naïfs si accès - aléatoire (jamais pire) - -## Garbage collection intelligent des snapshots - -### Principe - -Avec un multi-snapshot (LRU ou pas), on peut accumuler des snapshots. -Plutôt que de les garder tous, on peut les **élaguer intelligemment** -en fonction de leur coût de reconstruction. 
- -L'idée : un snapshot est "bon marché" à supprimer s'il est -reconstituable avec peu de patches depuis un snapshot voisin. Il est -"coûteux" à supprimer s'il fallait repartir de la base + beaucoup -de patches pour le recréer. - -### Stratégie 1 : Éviction par coût de reconstruction - -``` -Snapshots gardés en cache : - - S₁(jan) S₂(mars) S₃(avril) S₄(juin) S₅(sept) - │ │ │ │ │ - └─ 3p ──┘ └─ 1p ─┘ └─ 2p ─┘ └─ 4p ─┘ - - Coût de reconstruction de S₃ depuis S₂ = 1 patch → BON MARCHÉ - Coût de reconstruction de S₅ depuis S₄ = 4 patches → CHER - - → Supprimer S₃ en priorité (1 seul patch pour le recréer) -``` - -```python -def _evict_cheapest_snapshot(self): - """Remove the snapshot with the lowest reconstruction cost.""" - if len(self._snapshots) <= 1: - return - - items = list(self._snapshots.items()) - min_cost = float('inf') - min_idx = None - - for i in range(1, len(items)): - prev_instant, (_, prev_patch_idx) = items[i - 1] - curr_instant, (_, curr_patch_idx) = items[i] - cost = curr_patch_idx - prev_patch_idx # nb patches entre les deux - if cost < min_cost: - min_cost = cost - min_idx = i - - if min_idx is not None: - instant_to_evict = items[min_idx][0] - del self._snapshots[instant_to_evict] -``` - -### Stratégie 2 : Horizon temporel - -La simulation avance dans le temps. Les snapshots dans le **passé -lointain** sont moins utiles que ceux proches du présent. 
- -``` -Temps → ──────────────────────────────────► - -Snapshots : S₁ S₂ S₃ S₄ S₅ S₆ [curseur actuel] - ▲ ▲ - passé lointain passé récent - -Règle : garder les snapshots à distance croissante dans le passé - - - curseur actuel : toujours gardé - - P-1, P-2 : gardés (accès P-1 fréquent dans les formulas) - - P-12 : gardé (comparison annuelle) - - au-delà : élaguer -``` - -```python -def _gc_by_horizon(self, current_instant): - """Keep snapshots at increasing distance in the past.""" - keep_offsets = {0, 1, 2, 12, 24} # en mois - keep_instants = set() - - for offset in keep_offsets: - target = current_instant.offset(-offset, 'month') - # Garder le snapshot le plus proche de target - best = min(self._snapshots.keys(), - key=lambda s: abs(s - target), - default=None) - if best is not None: - keep_instants.add(best) - - # Supprimer les snapshots hors du set - for instant in list(self._snapshots.keys()): - if instant not in keep_instants: - del self._snapshots[instant] -``` - -### Stratégie 3 : Budget mémoire - -On fixe un **budget** en nombre de snapshots (ou en octets). 
Quand le -budget est dépassé, on évince le snapshot le moins utile : - -```python -class AsOfHolder: - def __init__(self, base, memory_budget_bytes=None, max_snapshots=5): - self._base = base - self._patches = [] - self._snapshots = OrderedDict() - self._max_snapshots = max_snapshots - self._memory_budget = memory_budget_bytes or (base.nbytes * max_snapshots) - - def _maybe_gc(self): - """Evict snapshots if over budget.""" - while len(self._snapshots) > self._max_snapshots: - self._evict_cheapest_snapshot() - - def get(self, period): - result = self._get_or_compute(period) - self._maybe_gc() - return result -``` - -### Stratégie 4 : Fusion de patches - -Quand deux patches consécutifs modifient les **mêmes indices**, on -peut les fusionner en un seul (seule la dernière valeur compte) : - -``` -Avant fusion : - Patch 3 : indices=[42, 99, 200], values=[2, 1, 3] - Patch 4 : indices=[42, 150], values=[0, 2] - -Après fusion : - Patch 3+4 : indices=[42, 99, 150, 200], values=[0, 1, 2, 3] - (personne 42 → valeur du patch 4 gagne) -``` - -Cela réduit le nombre d'opérations de scatter dans le GET. - -### Tableau récapitulatif - -| Stratégie GC | Quand utiliser | Complexité | Gain | -|---|---|---|---| -| **Coût de reconstruction** | Toujours | ~15 lignes | Évince les snapshots faciles à recréer | -| **Horizon temporel** | Simulations longues (>5 ans) | ~20 lignes | Libère la mémoire du passé | -| **Budget mémoire** | Contrainte RAM forte | ~10 lignes | Plafonné, prévisible | -| **Fusion de patches** | Beaucoup de patches sur les mêmes individus | ~20 lignes | Accélère le GET | - -### L'idée générale - -Le cache de snapshots se comporte comme un **arbre de checkpoints** : - -``` -base ──P₁──P₂──[S₁]──P₃──P₄──P₅──[S₂]──P₆──...──[Sₖ]──Pₙ──[curseur] - -où S = snapshot gardé, P = patch - -Le GC supprime les S dont le coût de re-traversée des P est faible. -Le GC garde les S proches du curseur (accès probables). -Le GC fusionne les P quand c'est possible. 
- -Le système s'adapte automatiquement : - - Simulation courte (30 mois) : 1-2 snapshots suffisent - - Simulation longue (30 ans) : horizon + budget → 5-10 snapshots - - Variable qui change souvent : plus de snapshots gardés - - Variable qui change peu : patches seuls, pas de snapshot -``` - -## Stratégie de stockage par variable - -### Pourquoi par variable ? - -Chaque variable a un profil de changement propre. Appliquer la même -stratégie à toutes n'est pas optimal : - -| Variable | Changement/mois | Profil | Stratégie idéale | -|---|---|---|---| -| `birth` | 0% | Constante | ETERNITY (déjà fait) | -| `marital_status` | ~0.1% | Très stable | Patches, pas de snapshot | -| `housing_occupancy_status` | ~0.5% | Stable | Patches, 1 snapshot | -| `employer_id` | ~2% | Modéré | Patches + 2-3 snapshots | -| `region_code` | ~0.3% | Stable | Patches, 1 snapshot | -| `salary` | ~100% | Volatile | Dense (l'AsOf n'a pas de sens) | - -### Déclaration sur la Variable - -```python -class marital_status(Variable): - value_type = Enum - definition_period = MONTH - as_of = "start" - # Le mainteneur peut préciser la stratégie : - storage_strategy = "patches" # ou "snapshots", "dense", "auto" - max_snapshots = 0 # pas de snapshot pour cette var - -class employer_id(Variable): - value_type = int - definition_period = MONTH - as_of = "start" - storage_strategy = "snapshots" - max_snapshots = 3 # garder 3 checkpoints - -class salary(Variable): - value_type = float - definition_period = MONTH - # Pas d'as_of : change tout le temps → stockage dense classique -``` - -### Stratégie "auto" (par défaut) - -Si le mainteneur ne précise pas, le système choisit à l'initialisation -en analysant les données d'input : - -```python -def _pick_strategy(self): - """Choose storage strategy based on input data profile.""" - if not self._patches: - return "dense" # pas de patches → stockage classique - - total_changes = sum(len(idx) for _, idx, _ in self._patches) - total_cells = len(self._base) * 
len(self._patches) - change_rate = total_changes / total_cells - - if change_rate < 0.01: - # Moins de 1% change par patch → patches seuls, pas de snapshot - return "patches_only" - elif change_rate < 0.10: - # 1-10% → patches + quelques snapshots - return "snapshots" - else: - # Plus de 10% → l'AsOf n'est pas rentable, dense - return "dense" -``` - -### Budget mémoire global réparti entre variables - -Au lieu d'un budget par variable, on peut fixer un **budget global** -et le répartir intelligemment : - -```python -class Simulation: - def __init__(self, ..., asof_memory_budget_mb=100): - self._asof_budget = asof_memory_budget_mb * 1e6 - - def _allocate_snapshot_budget(self): - """Give more snapshot budget to variables that change often.""" - asof_vars = [h for h in self.holders.values() if h._as_of] - - # Calculer le taux de changement de chaque variable - rates = {} - for holder in asof_vars: - total = sum(len(idx) for _, idx, _ in holder._patches) - rates[holder.variable.name] = total - - total_rate = sum(rates.values()) or 1 - - # Répartir le budget proportionnellement - for holder in asof_vars: - share = rates[holder.variable.name] / total_rate - holder._max_snapshots = max(1, int( - share * self._asof_budget / holder._base.nbytes - )) -``` - -### Exemple concret - -Budget global : 100 Mo, N = 1M personnes - -| Variable | dtype | Array size | Change rate | Budget | Snapshots | -|---|---|---|---|---|---| -| `marital_status` | int16 | 2 Mo | 0.1% | 5 Mo | **2** | -| `housing_status` | int16 | 2 Mo | 0.5% | 25 Mo | **12** | -| `employer_id` | int32 | 4 Mo | 2% | 50 Mo | **12** | -| `region_code` | int16 | 2 Mo | 0.3% | 20 Mo | **10** | - -La variable qui change le plus (`employer_id`) obtient le plus de -snapshots. Cell qui change le moins (`marital_status`) en a 2, ce -qui suffit pour le curseur actuel et un point de comparison. 
- -### Résumé - -La stratégie par variable permet un **arbitrage fin** : - -- **Déclaratif** : le mainteneur peut préciser `storage_strategy` et - `max_snapshots` s'il connait le profil de la variable -- **Automatique** : sinon le système analyse les données et choisit -- **Global** : un budget mémoire total est réparti entre les variables - proportionnellement à leur volatilité -- **Pas de surcoût pour les variables denses** : `salary` reste en - stockage classique, le mécanisme AsOf ne s'active que quand c'est - déclaré - -## Auto-tuning par profiling run - -### Principe - -Plutôt que de deviner la bonne stratégie, on **measure** les patterns -d'accès réels en faisant un run à blanc (profiling run), puis on résout -un problème d'optimisation sous contraintes. - -``` -┌─────────────────────────────────────────────────────┐ -│ Phase 1 : Profiling run │ -│ │ -│ Simulation normal + instrumentation │ -│ → enregistre pour chaque variable AsOf : │ -│ - nb d'accès par période │ -│ - pattern d'accès (séquentiel ? sauts ?) │ -│ - taux de changement réel │ -│ - taille d'un array │ -│ │ -│ Phase 2 : Optimisation │ -│ │ -│ Entrée : profils + contraintes (budget RAM, temps) │ -│ Sortie : nb de snapshots par variable │ -│ │ -│ Phase 3 : Run optimisé │ -│ │ -│ Simulation avec les stratégies calibrées │ -└─────────────────────────────────────────────────────┘ -``` - -### Phase 1 : Instrumentation - -Ajouter un mode `profile=True` au Holder qui enregistre les accès -sans changer le comportement : - -```python -class AsOfHolder: - def __init__(self, ..., profile=False): - self._profile = profile - self._access_log = [] # [(period, timestamp), ...] 
- - def get(self, period): - if self._profile: - self._access_log.append((period, time.monotonic())) - return self._do_get(period) - - def get_profile(self): - """Compute access profile from the log.""" - if not self._access_log: - return None - - periods = [p for p, _ in self._access_log] - n_accesses = len(periods) - - # Directionalité : % d'accès strictement croissants - forward = sum( - 1 for i in range(1, len(periods)) - if periods[i].start > periods[i-1].start - ) - directionality = forward / max(1, n_accesses - 1) - - # Spread : combien de périodes distinctes - unique_periods = len(set(periods)) - - # Taux de réaccès : même période demandée plusieurs fois - reaccess_rate = 1 - unique_periods / n_accesses - - return { - "variable": self.variable.name, - "n_accesses": n_accesses, - "unique_periods": unique_periods, - "directionality": directionality, - "reaccess_rate": reaccess_rate, - "n_patches": len(self._patches), - "change_rate": self._compute_change_rate(), - "array_bytes": self._base.nbytes, - } -``` - -### Phase 2 : Optimisation sous contraintes - -Le profil de chaque variable donne le **coût** de chaque stratégie. 
-On résout ensuite : - -``` -Minimiser Σᵢ temps_get(varᵢ, snapshotsᵢ) -Sous Σᵢ mémoire(varᵢ, snapshotsᵢ) ≤ budget_RAM - snapshotsᵢ ≥ 0 -``` - -Où pour chaque variable i, avec le profil mesuré : - -```python -def cost_model(profile, n_snapshots): - """Estimate time and memory for a given number of snapshots.""" - N = profile["array_bytes"] # taille d'un array - P = profile["n_patches"] # nb de patches - A = profile["n_accesses"] # nb d'accès total - d = profile["directionality"] # % accès forward - - # Mémoire = base + patches + snapshots - patch_bytes = sum(idx.nbytes + val.nbytes for _, idx, val in patches) - memory = N + patch_bytes + n_snapshots * N - - # Temps : dépend du pattern d'accès - if n_snapshots == 0: - # Patches naïfs : chaque accès repart de la base - avg_patches_per_get = P / 2 # en moyenne la moitié - time_per_get = N + avg_patches_per_get * k_per_patch - else: - # Avec snapshots : accès forward = incrémental - avg_gap = P / (n_snapshots + 1) # patches entre 2 snapshots - forward_cost = avg_gap * k_per_patch # O(k) par patch - backward_cost = N + P / 2 * k_per_patch # recalcul total - time_per_get = d * forward_cost + (1 - d) * backward_cost - - total_time = A * time_per_get - return memory, total_time -``` - -Résolution (budget RAM fixé, minimiser le temps) : - -```python -def optimize_strategies(profiles, ram_budget): - """Find optimal snapshot count per variable under RAM constraint. - - Simple greedy: give snapshots to the variable where it saves the - most time per byte of memory used. 
- """ - # Commencer avec 0 snapshots pour tout le monde - snapshots = {p["variable"]: 0 for p in profiles} - used_ram = sum(p["array_bytes"] for p in profiles) # bases only - - while True: - # Pour chaque variable, calculer le gain marginal - # d'ajouter 1 snapshot - best_var = None - best_ratio = 0 # time_saved / memory_added - - for p in profiles: - var = p["variable"] - n = snapshots[var] - mem_before, time_before = cost_model(p, n) - mem_after, time_after = cost_model(p, n + 1) - - mem_delta = mem_after - mem_before # = array_bytes - time_delta = time_before - time_after - - if time_delta > 0 and used_ram + mem_delta <= ram_budget: - ratio = time_delta / mem_delta # ms saved per Mo - if ratio > best_ratio: - best_ratio = ratio - best_var = var - best_mem = mem_delta - - if best_var is None: - break # plus de budget ou plus de gain - - snapshots[best_var] += 1 - used_ram += best_mem - - return snapshots -``` - -### Phase 3 : Application - -```python -class Simulation: - def optimize_asof_strategies(self, ram_budget_mb=100): - """Run profiling, then optimize storage strategies.""" - # Phase 1 : profiling run - for holder in self._asof_holders(): - holder._profile = True - - self._run_all_formulas() # run à blanc - - # Phase 2 : optimisation - profiles = [h.get_profile() for h in self._asof_holders()] - optimal = optimize_strategies(profiles, ram_budget_mb * 1e6) - - # Phase 3 : appliquer - for holder in self._asof_holders(): - holder._max_snapshots = optimal[holder.variable.name] - holder._profile = False - holder._access_log.clear() - - return optimal -``` - -Usage : - -```python -sim = SimulationBuilder().build(tbs, scenario) -# Calibration automatique -strategies = sim.optimize_asof_strategies(ram_budget_mb=200) -# → {'marital_status': 0, 'employer_id': 5, 'housing_status': 2, ...} - -# Maintenant la simulation est optimisée -result = sim.calculate("disposable_income", "2024-01") -``` - -### Ou bien : contrainte inverse (temps fixé, minimiser la RAM) - -``` 
-Minimiser Σᵢ mémoire(varᵢ, snapshotsᵢ) -Sous Σᵢ temps_get(varᵢ, snapshotsᵢ) ≤ budget_temps -``` - -Même algorithme greedy en inversant le ratio : on **ajoute** des -snapshots tant que le temps est au-dessus du budget, en ciblant la -variable où 1 snapshot fait gagner le plus de temps. - -### Ce que le profiling run capte - -| Donnée mesurée | Ce qu'elle révèle | Impact sur la stratégie | -|---|---|---| -| `directionality` = 0.95 | Accès quasi-linéaire dans le temps | 1 snapshot curseur suffit | -| `directionality` = 0.5 | Accès aléatoire (sauts) | Plus de snapshots distribués | -| `reaccess_rate` = 0.8 | Même période demandée souvent | Cache LRU très efficace | -| `n_accesses` = 2 | Variable rarement lue | Pas besoin de snapshot | -| `n_accesses` = 1000 | Variable très sollicitée | Investir en snapshots | -| `change_rate` = 0.001 | Très stable | Patches seuls suffisent | -| `change_rate` = 0.1 | Volatile | Snapshots fréquents ou dense | - -### Coût du profiling : ordres de grandeur mesurés - -Mesuré avec `country-template`, N = 1M personnes, i7-1185G7 : - -| Opération | Temps | Catégorie | -|---|---|---| -| **Simulation build** | **153 ms** | Setup (le plus cher) | -| `household.sum(salary)` | 5.1 ms | Agrégation entité | -| `numpy.where(mask, a, b)` | 4.0 ms | Calcul formule | -| `set_input(salary)` | 1.5 ms | Écriture | -| `a + b` | 1.0 ms | Calcul formule | -| **AsOf GET (30 patches)** | **0.85 ms** | AsOf | -| `numpy.copy(4 Mo)` | 0.28 ms | Copie | -| `default_array` (zeros) | 0.21 ms | Allocation | -| `calculate('income_tax')` (cache hit) | 0.08 ms | Pipeline | -| **AsOf incrémental (1 patch, 5K idx)** | **0.01 ms** | AsOf (snapshot) | -| `get_array` (cache hit) | 0.01 ms | Lecture | -| **3600 × list.append** (profiling) | **0.53 ms** | Instrumentation | - -L'instrumentation de profiling (0.53 ms) est **0.06%** du temps total -d'une simulation typique (~815 ms). Le vrai coût n'est pas le -profiling, c'est de savoir **quand** profiler. 
- -### Quand profiler : la question de l'homogénéité - -Le profiling "en ligne" (calibrer sur les premiers accès) ne marche -que si le pattern d'accès est **homogène dans le temps**. Or ce n'est -pas toujours le cas : - -| Cas | Homogène ? | Profiling en ligne | -|---|---|---| -| Simulation mensuelle 1 an, mêmes variables chaque mois | ✅ | ✅ Marche | -| Simulation 30 ans, `pension` n'apparait qu'après 20 ans | ❌ | ❌ Rate `pension` | -| Formule avec `P-12` : pas de lookback avant le mois 13 | ❌ | ❌ Directionnalité fausse | -| Réforme qui change le graphe de dépendances | ❌ | ❌ Profil du baseline ≠ réforme | - -### Stratégies de profiling - -| Stratégie | Quand | Coût | Fiabilité | -|---|---|---|---| -| **En ligne (50 premiers accès)** | Simulation homogène | 0 | Bonne si homogène | -| **Recalibration périodique** | Simulation hétérogène | ~0 (tous les 100 accès) | Bonne | -| **Baseline = profiling** | Multi-run avec même graphe | 0 (déjà fait) | Exacte | -| **Dry run complete** | Pattern très variable | 1× temps simulation | Exacte | - -#### Recalibration périodique (recommandée) - -La plus robuste : on réajuste la stratégie tous les K accès, sans -supposer l'homogénéité : - -```python -class AsOfHolder: - RECALIBRATE_EVERY = 100 # accès - - def get(self, period): - self._access_count += 1 - self._access_log.append(period.start) - - if self._access_count % self.RECALIBRATE_EVERY == 0: - self._recalibrate() - - return self._do_get(period) - - def _recalibrate(self): - """Adjust strategy based on recent access pattern.""" - recent = self._access_log[-self.RECALIBRATE_EVERY:] - - # Mesurer la directionnalité récente - forward = sum( - 1 for i in range(1, len(recent)) - if recent[i] > recent[i-1] - ) - directionality = forward / (len(recent) - 1) - - # Si accès très directionnel → 1 snapshot curseur suffit - if directionality > 0.8: - self._max_snapshots = 1 - # Si accès aléatoire → plus de snapshots - elif directionality < 0.3: - self._max_snapshots = min(5, 
self._max_snapshots + 1) - # Si accès avec lookback (P et P-12) → garder 2 - else: - self._max_snapshots = 2 -``` - -Le surcoût de recalibration : ~100 comparisons d'instants = **~0.01 ms** -tous les 100 accès. Invisible. - -#### Et si le pattern change radicalement ? - -Exemple : une simulation qui fait 12 mois linéairement, puis remonte -dans le temps pour comparer. La recalibration détecte le changement -de directionnalité et augmente les snapshots automatiquement. - -``` -Accès 1-12 : jan → fév → ... → déc (directionality = 1.0) - → stratégie : 1 snapshot curseur ✅ - -Accès 13-24 : déc → jan, déc → fév, ... (directionality = 0.0) - → recalibration détecte le changement - → stratégie : 3 snapshots distribués ✅ -``` - -Le coût du mauvais choix pendant les ~100 premiers accès de la -nouvelle phase est borné : au pire, on paye le prix des patches naïfs -pendant 100 accès × 0.85 ms = ~85 ms, puis on recalibre. Pour une -simulation de plusieurs seconds, c'est acceptable. diff --git a/docs/implementation/links-api.md b/docs/implementation/links-api.md deleted file mode 100644 index 073707c9b..000000000 --- a/docs/implementation/links-api.md +++ /dev/null @@ -1,68 +0,0 @@ -# Entity Links API - -OpenFisca Core now includes a generic Entity Link system. Links allow variables computed on one entity to be queried and aggregated from another, or even within the same entity. - -## Declaring Links - -Links are declared on `Entity` objects, typically when building the `TaxBenefitSystem`. - -### 1. Many-to-One Links -A `Many2OneLink` resolves many source members (e.g., persons) to one target entity (e.g., a household, an employer, or another person). - -```python -from openfisca_core.links import Many2OneLink - -# Example: Intra-entity link (person to mother) -# The `mother_id` variable must be defined on `person` and contain the ID of the mother. 
-mother_link = Many2OneLink( - name="mother", - link_field="mother_id", - target_entity_key="person", -) -person_entity.add_link(mother_link) - -# Usage in a variable formula: -# persons.mother.get("age", period) -# or chained: persons.mother.household.get("rent", period) -``` - -### 2. One-to-Many Links -A `One2ManyLink` resolves one source entity to many target members. By default, OpenFisca implicitly creates a `One2ManyLink` for every GroupEntity pointing to its members (e.g., `household.persons`). - -```python -from openfisca_core.links import One2ManyLink - -# Example: Inter-entity link (employer to employees) -# The `employer_id` variable must be defined on `person` and contain the employer ID. -employees_link = One2ManyLink( - name="employees", - link_field="employer_id", - target_entity_key="person", # the target returned -) -employer_entity.add_link(employees_link) - -# Usage in a variable formula: -# employers.employees.sum("salary", period) -``` - -## Using Links in Formulas - -When a link is declared on a population, it is exposed as an attribute matching the link's `name`. - -### Many2One Methods - -* **`link.get(variable_name, period)`**: Returns the target variable values mapped to each source member. Unmapped members receive the default value of the variable. -* **Syntactic sugar**: `link(variable_name, period)` is equivalent to `link.get(variable_name, period)`. -* **Chaining**: `.link1.link2` returns an intermediate chained getter, so `.link1.link2.get(variable, period)` fetches the target variable across two link jumps. - -### One2Many Methods - -All One2Many aggregation methods return an array sized to the **source** entity. They all take `(variable_name, period)` + optional keyword arguments `role` and `condition` to filter the targets before aggregation. 
- -* `link.sum(...)` -* `link.count(...)` -* `link.any(...)` -* `link.all(...)` -* `link.min(...)` -* `link.max(...)` -* `link.avg(...)` diff --git a/docs/implementation/transition-guide.md b/docs/implementation/transition-guide.md deleted file mode 100644 index dace18ff4..000000000 --- a/docs/implementation/transition-guide.md +++ /dev/null @@ -1,95 +0,0 @@ -# Transition Guide: Moving to the New Entity Links - -With the release of the **Generic Entity Links** API, OpenFisca-core gains the ability to map complex, graph-like relational structures natively. - -This guide explains the primary differences between the legacy `GroupEntity` + `Projectors` approach and the flexible, modern `Many2OneLink` and `One2ManyLink` models, and how you should think about migration. - ---- - -## 1. Why Transition? The "Strict Hierarchy" Problem - -Historically, OpenFisca rigidly structured populations into two classes: `SingleEntity` (Persons) and `GroupEntity` (Households, Families, Tax Units). - -In this model, **every person must belong to exactly one entity of each group type.** -This handles standard socio-tax models efficiently, but prohibits features like: -- **Intra-entity (horizontal) relations**: Modeling a mother/child bond, marriages, or kinship networks. *Persons couldn't map to other Persons.* -- **Unbounded inter-entity relations**: Employment networks where one `company` controls multiple `persons`, or geographical relations (people living in specific arbitrary administrative districts). - -**The Solution:** The new Entity Links system is purely arbitrary and structural. You can declare `Many2OneLink` (N source members to 1 target entity) or `One2ManyLink` (aggregating 1 target back to N source members) linking *any population type to any other population type.* - ---- - -## 2. You don’t *have* to migrate existing simple groups. 
- -**Backward Compatibility is 100% Guaranteed.** - -If you have a traditional `GroupEntity` defined for households, those work exactly as they always have. In fact, OpenFisca now silently powers them using the new Linking engine gracefully: -- The legacy `person.household(...)` projector maps to a new automatically injected `ImplicitMany2OneLink`. -- The legacy `household.sum(person_salaries)` maps logically to `household.persons.sum()`. - -No code change is required in any existing variable formulas! - ---- - -## 3. From Projectors to Links: The New Syntax - -If you previously dealt with `Projectors`, you may have found chaining difficult or buggy. The new system standardizes data lookup through `link.get()` and properties filtering. - -### Before: Projectors -If you wanted the value of `rent` for the household of a person: -```python -# Projector syntax -rents = person.household("rent", period) -``` - -### After: Link Syntax -The same syntax continues to work (it actually calls `.get()` internally now on the implicitly generated link!), but you can explicitly specify `.get()`: -```python -# New link syntax -rents = person.household.get("rent", period) -``` - -**Where the new syntax shines:** Deep chaining. -You can now continuously resolve attributes down a deep relationship chain effortlessly: -```python -# Imagine a link: `person -> mother_person -> mother_household -> region` -chain = person.mother.household.get("region", period) -``` - ---- - -## 4. Transitioning Aggregations: `sum`, `count`, `min`, `max` - -Previously, aggregating members relied rigidly on passing entire pre-computed arrays to a heavy `GroupPopulation.sum()` handler. - -### Before: Legacy GroupPopulation -```python -# Fetch array of all persons in simulation -salaries = persons("salary", period) -# Pass to the group entity (e.g. 
household) to aggregate and collapse -total_household_incomes = households.sum(salaries, role=Household.PARENT) -``` - -### After: Declarative Links -```python -# The logic operates directly on the `One2ManyLink` bridging the two entities. -total_household_incomes = households.persons.sum("salary", period, role=Household.PARENT) -``` -Notice how declarative and explicit this is. `persons` is the plural of `person`, which the new system automatically exposed as a `One2ManyLink` on your household. - -### Conditional Aggregations -A newly-available feature explicitly unlocked by the Link system is masking by arbitrary properties! You are no longer restricted strictly to OpenFisca Roles: -```python -is_female = persons("is_female", period) -# Sum salaries, but only for members who are `is_female` -female_incomes = households.persons.sum("salary", period, condition=is_female) -``` - ---- - -## 5. Summary Checklist for Country Packages -- [ ] You **do not** need to rewrite `GroupEntity` logic for entities whose only purpose is traditional demographic grouping (like core households). -- [ ] You **can** start using `households.persons.sum()`, `households.persons.any()`, `households.persons.avg()` for highly readable aggregations in new variables. -- [ ] You **should** use `Many2OneLink` immediately if your simulation model attempts to relate `persons` to specific entities beyond openfisca-standard hierarchical groups (like a `mother_id` linking to another row in the `persons` dataframe). - -Please see the full `links-api.md` file in this directory to see exactly how to declare explicit `Many2OneLink` models inside your `TaxBenefitSystem`. diff --git a/openfisca_core/entities/_core_entity.py b/openfisca_core/entities/_core_entity.py index b1f97a910..33002e9af 100644 --- a/openfisca_core/entities/_core_entity.py +++ b/openfisca_core/entities/_core_entity.py @@ -47,9 +47,6 @@ class CoreEntity: #: A ``TaxBenefitSystem`` instance. 
_tax_benefit_system: None | t.TaxBenefitSystem = None - #: Named links to other entities (Many2One, One2Many, etc.). - _links: dict - @abc.abstractmethod def __init__(self, *__args: object, **__kwargs: object) -> None: ... @@ -60,43 +57,6 @@ def set_tax_benefit_system(self, tax_benefit_system: t.TaxBenefitSystem) -> None """A ``CoreEntity`` belongs to a ``TaxBenefitSystem``.""" self._tax_benefit_system = tax_benefit_system - # -- Link management -------------------------------------------------- - - def add_link(self, link) -> None: - """Register a named link on this entity. - - Args: - link: A ``Link`` instance (Many2OneLink, One2ManyLink, etc.). - - Example:: - - from openfisca_core.links import Many2OneLink - - mother = Many2OneLink( - name="mother", - link_field="mother_id", - target_entity_key="person", - ) - person_entity.add_link(mother) - - """ - if not hasattr(self, "_links") or self._links is None: - self._links = {} - self._links[link.name] = link - - def get_link(self, name: str): - """Retrieve a link by name, or ``None`` if not found.""" - if not hasattr(self, "_links") or self._links is None: - return None - return self._links.get(name) - - @property - def links(self) -> dict: - """All links registered on this entity.""" - if not hasattr(self, "_links") or self._links is None: - self._links = {} - return self._links - def get_variable( self, variable_name: t.VariableName, diff --git a/openfisca_core/experimental/_memory_config.py b/openfisca_core/experimental/_memory_config.py index 10cffdc1c..6fba790e9 100644 --- a/openfisca_core/experimental/_memory_config.py +++ b/openfisca_core/experimental/_memory_config.py @@ -19,15 +19,11 @@ class MemoryConfig: #: Variables to drop. variables_to_drop: frozenset[str] - #: Default number of LRU snapshots for as_of variables (None = use holder default). 
- asof_max_snapshots: int | None - def __init__( self, max_memory_occupation: str | float, priority_variables: Iterable[str] = frozenset(), variables_to_drop: Iterable[str] = frozenset(), - asof_max_snapshots: int | None = None, ) -> None: message = [ "Memory configuration is a feature that is still currently under " @@ -44,4 +40,3 @@ def __init__( self.max_memory_occupation_pc = self.max_memory_occupation * 100 self.priority_variables = frozenset(priority_variables) self.variables_to_drop = frozenset(variables_to_drop) - self.asof_max_snapshots = asof_max_snapshots diff --git a/openfisca_core/holders/holder.py b/openfisca_core/holders/holder.py index 2ff8f411d..f60d92f70 100644 --- a/openfisca_core/holders/holder.py +++ b/openfisca_core/holders/holder.py @@ -3,10 +3,8 @@ from collections.abc import Sequence from typing import Any -import bisect import os import warnings -from collections import OrderedDict import numpy import psutil @@ -31,36 +29,7 @@ def __init__(self, variable, population) -> None: self.variable = variable self.simulation = population.simulation self._eternal = self.variable.definition_period == periods.DateUnit.ETERNITY - self._as_of = getattr(self.variable, "as_of", False) self._memory_storage = storage.InMemoryStorage(is_eternal=self._eternal) - if self._as_of: - # Sparse patch storage. - # _as_of_base : first full array set (read-only). - # _as_of_base_instant : Instant at which the base was established. - # _as_of_patches : sorted list of (Instant, idx_array, val_array). - # _as_of_patch_instants : parallel list of Instants for bisect. - # _as_of_snapshots : LRU OrderedDict instant → (array, patch_idx). - # _as_of_max_snapshots : maximum number of snapshots to keep. 
- self._as_of_base = None - self._as_of_base_instant = None - self._as_of_patches: list = [] - self._as_of_patch_instants: list = [] - self._as_of_snapshots: OrderedDict = OrderedDict() - # Resolution order: variable.snapshot_count > MemoryConfig.asof_max_snapshots > 3 - _mc = self.simulation.memory_config if self.simulation else None - self._as_of_max_snapshots: int = next( - ( - v - for v in [ - getattr(self.variable, "snapshot_count", None), - getattr(_mc, "asof_max_snapshots", None), - ] - if v is not None - ), - 3, - ) - # Instants for which transition_formula has already been applied. - self._as_of_transition_computed: set = set() # By default, do not activate on-disk storage, or variable dropping self._disk_storage = None @@ -85,19 +54,6 @@ def clone(self, population: t.CorePopulation) -> t.Holder: if key not in ("population", "formula", "simulation"): new_dict[key] = value - if self._as_of: - # _as_of_base is read-only and can be shared between clones. - # Patch lists must be independent so that writes to the clone - # don't corrupt the original's list, but the inner arrays (idx/vals) - # are read-only and can be shared. - new_dict["_as_of_patches"] = list(self._as_of_patches) - new_dict["_as_of_patch_instants"] = list(self._as_of_patch_instants) - # Snapshots are not cloned: first access will reconstruct cheaply. - new_dict["_as_of_snapshots"] = OrderedDict() - new_dict["_as_of_transition_computed"] = set( - self._as_of_transition_computed - ) - new_dict["population"] = population new_dict["simulation"] = population.simulation @@ -131,209 +87,13 @@ def get_array(self, period): """ if self.variable.is_neutralized: return self.default_array() - if self._as_of: - # Patch-based storage: bypass _memory_storage entirely. 
- return self._get_as_of(period) value = self._memory_storage.get(period) if value is not None: return value - if self._as_of: - value = self._get_as_of(period) - if value is not None: - return value if self._disk_storage: return self._disk_storage.get(period) return None - def _get_as_of(self, period): - """Return the reconstructed array as-of the reference instant of period.""" - target = period.start if self._as_of == "start" else period.stop - return self._reconstruct_at(target) - - def _cache_snapshot(self, instant, array, patch_idx) -> None: - """Insert (or refresh) a snapshot in the LRU cache, evicting the least - recently used entry if the cache is full.""" - self._as_of_snapshots[instant] = (array, patch_idx) - self._as_of_snapshots.move_to_end(instant) - if len(self._as_of_snapshots) > self._as_of_max_snapshots: - self._as_of_snapshots.popitem(last=False) # evict LRU - - def _reconstruct_at(self, target_instant): - """Reconstruct the dense array at target_instant from base + patches. - - Uses a multi-snapshot LRU cache for O(k) incremental cost. - Falls back to O(N + k*P) full reconstruction when no usable snapshot - exists (e.g. backward jump past all cached snapshots). - - Returns None if no base has been set yet, or if target_instant is - before the base was established. - """ - if self._as_of_base is None or target_instant < self._as_of_base_instant: - return None - - # Number of patches that apply: all with instant <= target. - pos = bisect.bisect_right(self._as_of_patch_instants, target_instant) - last_patch_idx = pos - 1 # -1 means only the base applies - - # Exact cache hit — O(1). - if target_instant in self._as_of_snapshots: - array, _ = self._as_of_snapshots[target_instant] - self._as_of_snapshots.move_to_end(target_instant) - return array - - # Find best starting snapshot: latest snap_instant < target_instant. 
- best_instant = None - best_array = None - best_patch_idx = None - for snap_instant, (snap_array, snap_patch_idx) in self._as_of_snapshots.items(): - if snap_instant < target_instant: - if best_instant is None or snap_instant > best_instant: - best_instant = snap_instant - best_array = snap_array - best_patch_idx = snap_patch_idx - - if best_array is not None: - # Incremental forward reconstruction from best snapshot. - result = best_array - for i in range(best_patch_idx + 1, last_patch_idx + 1): - _, idx, vals = self._as_of_patches[i] - if result is best_array: - result = result.copy() - result[idx] = vals - if result is not best_array: - result.flags.writeable = False - else: - # Full reconstruction from base (no usable snapshot). - result = self._as_of_base.copy() - for i in range(last_patch_idx + 1): - _, idx, vals = self._as_of_patches[i] - result[idx] = vals - result.flags.writeable = False - - self._cache_snapshot(target_instant, result, last_patch_idx) - return result - - def _set_as_of(self, period, value) -> None: - """Store value for an as_of variable using sparse patch storage. - - On the first call: stores the full array as an immutable base. - On subsequent calls: computes the diff vs the current state at - period.start and stores only (changed_indices, changed_values) as a - sparse patch. If nothing changed, nothing is stored. - """ - instant = period.start - - if self._as_of_base is None: - # First set_input: establish the base and seed the snapshot cache. - self._as_of_base = value.copy() - self._as_of_base.flags.writeable = False - self._as_of_base_instant = instant - self._cache_snapshot(instant, self._as_of_base, -1) - return - - prev = self._reconstruct_at(instant) - changed = value != prev - if not changed.any(): - return # Value unchanged — no storage needed. - - idx = numpy.where(changed)[0].astype(numpy.int32) - vals = value[idx].copy() - - # Insert at sorted position (handles out-of-order set_input). 
- pos = bisect.bisect_right(self._as_of_patch_instants, instant) - self._as_of_patches.insert(pos, (instant, idx, vals)) - self._as_of_patch_instants.insert(pos, instant) - - new_patch_idx = len(self._as_of_patches) - 1 - if pos == new_patch_idx: - # Appended at the end (forward-sequential SET): cache current value - # so the next GET(instant) is an O(1) hit. - new_snap = value.copy() - new_snap.flags.writeable = False - self._cache_snapshot(instant, new_snap, new_patch_idx) - else: - # Retroactive (out-of-order) SET: evict all snapshots at or after - # this instant — they may no longer reflect the inserted patch. - to_evict = [k for k in self._as_of_snapshots if k >= instant] - for k in to_evict: - del self._as_of_snapshots[k] - - def _set_as_of_sparse(self, period, idx, vals) -> None: - """Store a sparse patch directly, without requiring a full N-array. - - Bypasses the O(N) diff computation of _set_as_of when the caller - already knows which elements changed (idx) and their new values (vals). - - idx : int32 array of changed indices - vals : array of new values (same dtype as the variable) - """ - if self._as_of_base is None: - raise ValueError( - "Cannot call set_input_sparse before the base is established. " - "Call set_input first for the initial period." - ) - - instant = period.start - - if len(idx) == 0: - return # nothing changed - - # Insert at sorted position (handles out-of-order calls) - pos = bisect.bisect_right(self._as_of_patch_instants, instant) - self._as_of_patches.insert(pos, (instant, idx.astype(numpy.int32), vals.copy())) - self._as_of_patch_instants.insert(pos, instant) - - new_patch_idx = len(self._as_of_patches) - 1 - if pos == new_patch_idx: - # Forward-sequential: build new snapshot from best cached snapshot. 
- best_instant = None - best_array = None - best_patch_idx_snap = None - for snap_instant, ( - snap_array, - snap_patch_idx, - ) in self._as_of_snapshots.items(): - if snap_instant <= instant: - if best_instant is None or snap_instant > best_instant: - best_instant = snap_instant - best_array = snap_array - best_patch_idx_snap = snap_patch_idx - if best_array is not None: - new_snap = best_array.copy() # O(N) — unavoidable for dense snapshot - # Apply any patches between the snapshot and the new one. - for i in range(best_patch_idx_snap + 1, new_patch_idx): - _, pidx, pvals = self._as_of_patches[i] - new_snap[pidx] = pvals - new_snap[idx] = vals - new_snap.flags.writeable = False - self._cache_snapshot(instant, new_snap, new_patch_idx) - # else: no snapshot yet — next GET will rebuild from base. - else: - # Retroactive insert: evict all snapshots at or after this instant. - to_evict = [k for k in self._as_of_snapshots if k >= instant] - for k in to_evict: - del self._as_of_snapshots[k] - - def set_input_sparse(self, period, idx, vals) -> None: - """Set new values for only the specified individuals. - - Unlike set_input(), the caller provides the diff directly: - - idx : array of person indices that changed (int) - - vals : their new values - - This avoids O(N) diff computation when only k << N individuals change. - Requires that set_input() was called at least once to establish the base. - """ - if not self._as_of: - raise ValueError( - f"set_input_sparse is only valid for as_of variables. " - f'"{self.variable.name}" does not declare as_of.' - ) - period = periods.period(period) - idx = numpy.asarray(idx, dtype=numpy.int32) - vals = numpy.asarray(vals, dtype=self.variable.dtype) - self._set_as_of_sparse(period, idx, vals) - def get_memory_usage(self) -> t.MemoryUsage: """Get data about the virtual memory usage of the Holder. 
@@ -535,11 +295,6 @@ def _set(self, period, value) -> None: error_message, ) - if self._as_of: - # Sparse patch storage — bypass _memory_storage and disk entirely. - self._set_as_of(period, value) - return - should_store_on_disk = ( self._on_disk_storable and self._memory_storage.get(period) is None @@ -547,25 +302,11 @@ def _set(self, period, value) -> None: >= self.simulation.memory_config.max_memory_occupation_pc ) - if self._as_of: - # Reference sharing: reuse existing array object when value unchanged, - # otherwise store a read-only defensive copy so that callers cannot - # corrupt stored data by mutating their original array in-place. - prev = self._get_as_of(period) - if prev is not None and numpy.array_equal(value, prev): - value = prev # prev is already read-only - else: - value = value.copy() - value.flags.writeable = False - if should_store_on_disk: self._disk_storage.put(value, period) else: self._memory_storage.put(value, period) - if self._as_of: - self._register_instant(period) - def put_in_cache(self, value, period) -> None: if self._do_not_store: return diff --git a/openfisca_core/indexed_enums/enum_array.py b/openfisca_core/indexed_enums/enum_array.py index f42ca9eb8..65bc209a7 100644 --- a/openfisca_core/indexed_enums/enum_array.py +++ b/openfisca_core/indexed_enums/enum_array.py @@ -266,7 +266,8 @@ def decode(self) -> t.ObjArray: result: t.ObjArray if self.possible_values is None: msg = ( - f"The possible values of the {self.__class__.__name__} are not defined." + f"The possible values of the {self.__class__.__name__} are " + f"not defined." ) raise TypeError(msg) array = self.reshape(1).astype(t.EnumDType) if self.ndim == 0 else self @@ -300,7 +301,8 @@ def decode_to_str(self) -> t.StrArray: result: t.StrArray if self.possible_values is None: msg = ( - f"The possible values of the {self.__class__.__name__} are not defined." + f"The possible values of the {self.__class__.__name__} are " + f"not defined." 
) raise TypeError(msg) array = self.reshape(1).astype(t.EnumDType) if self.ndim == 0 else self diff --git a/openfisca_core/links/IMPLEMENTATION_PLAN.md b/openfisca_core/links/IMPLEMENTATION_PLAN.md deleted file mode 100644 index 090056a01..000000000 --- a/openfisca_core/links/IMPLEMENTATION_PLAN.md +++ /dev/null @@ -1,113 +0,0 @@ -# Entity Links — Implementation Plan - -## Branch: `feat/entity-links` - -Based on: `master` (3114a8f) - -## Objective - -Add a generic link system to openfisca-core that: -1. **Preserves 100% backward compatibility** — all existing formulas, - projectors, and tests continue working unchanged -2. **Enables new capabilities** — intra-entity links (person→person), - custom named links (person→employer), link chaining -3. **Coexists with the current system** — can be activated per-entity, - the old GroupEntity/Projector path keeps working as fallback - -## Architecture - -``` -NEW module: openfisca_core/links/ -├── __init__.py ✅ created -├── link.py ✅ created — base Link class -├── many2one.py ✅ created — Many2OneLink (N→1 with roles) -├── one2many.py ✅ created — One2ManyLink (1→N with aggregations) -└── tests/ - └── __init__.py ✅ created — tests for Link - -MODIFIED (Phase 2+): -├── entities/_core_entity.py — add _links dict -├── entities/group_entity.py — auto-generate links from roles -├── simulations/simulation.py — resolve links at init -└── projectors/ — delegate to links (Phase 4) -``` - -## Phases - -### Phase 1: Core link classes ✅ (done) - -- [x] `Link` base class with attach/resolve lifecycle -- [x] `Many2OneLink.get()` — value lookup via link_field -- [x] `Many2OneLink` chaining via `__getattr__` -- [x] `Many2OneLink` role helpers (role, has_role) -- [x] `One2ManyLink` aggregations (sum, count, any, all, min, max, avg) -- [x] `One2ManyLink` role and condition filtering -- [x] ID resolution (direct positions + id_to_rownum) -- [x] Unit tests for Link base class - -### Phase 2: Entity integration ✅ (done) - -- [x] Add `_links: 
dict[str, Link]` to `CoreEntity` -- [x] Add `add_link(link)`, `get_link(name)`, `links` property -- [x] In `Simulation.__init__`, call `_resolve_links()` → attach + resolve -- [x] Tests: entity registration, simulation resolution, backward compat -- [x] 14 tests pass (5 unit + 9 integration), 147 total - -### Phase 3: Auto-generate Implicit Links (✅ Completed) -- [x] Create `openfisca_core/links/implicit.py` with `ImplicitMany2OneLink` and `ImplicitOne2ManyLink`. -- [x] Have them map to `GroupPopulation.members_entity_id`/`members_role` instead of explicit link fields. -- [x] Automatically inject these links on populations when the `Simulation` object is built (`_resolve_links`). -- [x] Test person->group and group->persons lookups using only `SimulationBuilder` group dictionaries. -- [x] Make sure all links are bound to `population.links` instead of remaining unbound on `entity.links`. - -### Phase 4: Projectors as facades (✅ Skipped) - -- [x] Obsoleted by Phase 3: the `__getattr__` overload on `CorePopulation` natively maps known shortcut properties to their automatically generated links (e.g. `person.household`, `household.persons`). Existing projectors remain untouched as fallbacks. 
-- [ ] Reimplement UniqueRoleToEntityProjector via One2ManyLink.get_by_role() -- [ ] Reimplement FirstPersonToEntityProjector via One2ManyLink.nth() -- [ ] Non-regression: all existing tests pass with delegated projectors - -### Phase 5: Country package API (✅ Completed) - -- [x] Allow country packages to declare custom links on entities (`entity.add_link()`) -- [x] Example: mother/children intra-entity links validated in integration tests -- [x] Example: employer inter-entity link supported via explicit fields -- [x] Documentation written in `docs/implementation/links-api.md` - -### Phase 6: Integration tests (✅ Completed) - -- [x] Full simulation with intra-entity links (person.mother.age) -- [x] Full simulation with link chaining (person.mother.household.rent) -- [x] Performance benchmark: links vs current projectors (performance is ~identical for get, slightly slower for aggregations but <1ms in overhead) -- [x] Non-regression: openfisca-core and openfisca-country-template tests all pass - -## Key design decisions - -1. **Links live on Entity, not Population**: Link definitions are - structural (like roles), so they belong on the entity definition. - At simulation time, `attach()` and `resolve()` bind them to - actual populations. - -2. **link_field is a Variable name**: The field holding target IDs - is a regular Variable (e.g. `mother_id`). This means it can be - an input, computed by a formula, or even an AsOfVariable. - -3. **Backward compatibility via fallback**: If a link is not defined, - the existing GroupPopulation/Projector code path is used. Links - are opt-in. - -4. **id_to_rownum for intra-entity links**: Person→person links need - to map person IDs to row positions. This uses the LIAM2 - `id_to_rownum` pattern, which is a simple numpy array where - `id_to_rownum[person_id] = row_index`. 
- -## Files created - -| File | Lines | Purpose | -|---|---|---| -| `links/__init__.py` | 36 | Package init | -| `links/link.py` | 109 | Base Link class | -| `links/many2one.py` | 189 | Many2OneLink | -| `links/one2many.py` | 223 | One2ManyLink | -| `links/tests/__init__.py` | 59 | Unit tests | -| `IMPLEMENTATION_PLAN.md` | this file | Plan | diff --git a/openfisca_core/links/__init__.py b/openfisca_core/links/__init__.py deleted file mode 100644 index 3218f4662..000000000 --- a/openfisca_core/links/__init__.py +++ /dev/null @@ -1,36 +0,0 @@ -"""Links between entities (Many2One, One2Many, chaining). - -This module implements a generic link system inspired by LIAM2, enriched -with OpenFisca's role semantics. Links are orthogonal to the existing -GroupEntity/Projector machinery: current code keeps working unchanged, -and links provide additional capabilities (intra-entity links, chaining, -arbitrary named links). - -Usage in a country package:: - - from openfisca_core.links import Many2OneLink, One2ManyLink - - # Declare a person→person link (intra-entity) - mother = Many2OneLink( - name="mother", - link_field="mother_id", - target_entity_key="person", - ) - - # Declare a person→employer link (inter-entity) - employer = Many2OneLink( - name="employer", - link_field="employer_id", - target_entity_key="employer", - ) -""" - -from .link import Link -from .many2one import Many2OneLink -from .one2many import One2ManyLink - -__all__ = [ - "Link", - "Many2OneLink", - "One2ManyLink", -] diff --git a/openfisca_core/links/implicit.py b/openfisca_core/links/implicit.py deleted file mode 100644 index 58c8fc8e3..000000000 --- a/openfisca_core/links/implicit.py +++ /dev/null @@ -1,111 +0,0 @@ -"""Implicit links dynamically inferred from OpenFisca's GroupPopulation system.""" - -from __future__ import annotations - -import numpy - -from .many2one import Many2OneLink -from .one2many import One2ManyLink - - -class ImplicitMany2OneLink(Many2OneLink): - """A person → group link using 
GroupPopulation's internal arrays. - - This bypasses the usual Variable lookup (``simulation.calculate``) - and directly reads ``members_entity_id`` and ``members_role`` from - the target GroupPopulation. - """ - - def __init__(self, group_entity_key: str): - super().__init__( - name=group_entity_key, - link_field="", # Not used - target_entity_key=group_entity_key, - ) - - def _get_target_ids(self, period) -> numpy.ndarray: - return self._target_population.members_entity_id - - @property - def role(self) -> numpy.ndarray | None: - return self._target_population.members_role - - def _project_implicit(self, result: numpy.ndarray) -> numpy.ndarray: - # Fully compatible with old Projector logic - return self._target_population.project(result) - - -class ImplicitOne2ManyLink(One2ManyLink): - """A group → person link using GroupPopulation's internal arrays.""" - - def __init__(self, name: str, group_entity_key: str): - super().__init__( - name=name, - link_field="", # Not used - target_entity_key="person", # The target of the O2M is persons - ) - self._group_entity_key = group_entity_key - - def _source_rows(self, period) -> numpy.ndarray: - # For a group->person O2M, the source is the group, the target is the person. - # members_entity_id is an array of length person.count, containing the group index. - # So members_entity_id IS the array of source rows for each target member. 
- return self._source_population.members_entity_id - - def _apply_filters(self, period, values, role, condition): - source_rows = self._source_rows(period) - mask = numpy.ones(len(source_rows), dtype=bool) - - if role is not None: - roles = self._source_population.members_role - # roles may be an object array of Role instances, so compare by key - if roles.dtype == object: - try: - keys = numpy.fromiter( - (getattr(x, "key", x) for x in roles), - dtype=object, - ) - except Exception: - mask &= roles == role - else: - mask &= keys == role - else: - mask &= roles == role - - if condition is not None: - mask &= condition - - source_rows = source_rows[mask] - values = values[mask] - - valid = source_rows >= 0 - return source_rows[valid], values[valid] - - # override to avoid relying on ``role_field`` which is meaningless for - # implicit links (the role information is stored on the source population) - def get_by_role( - self, - variable_name: str, - period, - role_value, - *, - condition: numpy.ndarray | None = None, - ) -> numpy.ndarray: - """Fetch value for a specific role value on a one-to-many implicit link. - - This mirrors :meth:`One2ManyLink.get_by_role` but uses - ``self._source_population.members_role`` instead of a named role field - on the target population. 
- """ - values = self._target_population.simulation.calculate(variable_name, period) - source_rows, values = self._apply_filters(period, values, role_value, condition) - - result = numpy.zeros(self._source_population.count, dtype=values.dtype) - # last value wins (same semantics as GroupPopulation.value_from_person) - for tgt_idx, src in enumerate(source_rows): - if src >= 0: - result[src] = values[tgt_idx] - return result - - -__all__ = ["ImplicitMany2OneLink", "ImplicitOne2ManyLink"] diff --git a/openfisca_core/links/link.py b/openfisca_core/links/link.py deleted file mode 100644 index ec651809c..000000000 --- a/openfisca_core/links/link.py +++ /dev/null @@ -1,109 +0,0 @@ -"""Base Link class for entity relationships.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - import numpy - - -class Link: - """A named, directed relationship between two entity populations. - - A Link is defined by: - - - A *name* used to access the link from formulas (e.g. "mother", "household"). - - A *link_field*: the name of the Variable (on the source entity) that holds - the target entity IDs. - - A *target_entity_key*: the entity key of the target population. - - Optionally, a *role_field* and *position_field* to attach OpenFisca-style - role and position metadata to the relationship. - - - Sub-classes ``Many2OneLink`` and ``One2ManyLink`` add resolution logic. - - Parameters - ---------- - name : str - Human-readable name of the link (e.g. ``"mother"``). - link_field : str - Variable name on the **source** entity that holds IDs pointing to - the target entity. - target_entity_key : str - The ``Entity.key`` of the target entity. - role_field : str or None - Optional variable name holding the role of each source member - within the target group. - position_field : str or None - Optional variable name holding the positional index of each - source member within the target group. 
- """ - - def __init__( - self, - name: str, - link_field: str, - target_entity_key: str, - *, - role_field: str | None = None, - position_field: str | None = None, - ) -> None: - self.name = name - self.link_field = link_field - self.target_entity_key = target_entity_key - self.role_field = role_field - self.position_field = position_field - - # Resolved after simulation setup - self._source_population = None - self._target_population = None - - # -- lifecycle ---------------------------------------------------------- - - def attach(self, source_population) -> None: - """Bind this link to its source population.""" - self._source_population = source_population - - def resolve(self, populations: dict) -> None: - """Resolve ``target_entity_key`` to an actual population object. - - Parameters - ---------- - populations : dict[str, Population] - All populations in the simulation, keyed by entity key. - """ - if self.target_entity_key not in populations: - msg = ( - f"Link '{self.name}': target entity " - f"'{self.target_entity_key}' not found in populations " - f"{list(populations.keys())}" - ) - raise KeyError(msg) - self._target_population = populations[self.target_entity_key] - - # -- helpers ------------------------------------------------------------ - - @property - def is_resolved(self) -> bool: - return ( - self._source_population is not None and self._target_population is not None - ) - - def _get_link_ids(self, period) -> numpy.ndarray: - """Return the array of target IDs for every source member. - - This reads the ``link_field`` variable from the source population. 
- """ - return self._source_population(self.link_field, period) - - def __repr__(self) -> str: - return ( - f"{self.__class__.__name__}(" - f"name={self.name!r}, " - f"link_field={self.link_field!r}, " - f"target={self.target_entity_key!r})" - ) - - -__all__ = ["Link"] diff --git a/openfisca_core/links/many2one.py b/openfisca_core/links/many2one.py deleted file mode 100644 index 83d63181f..000000000 --- a/openfisca_core/links/many2one.py +++ /dev/null @@ -1,310 +0,0 @@ -"""Many-to-one link: N source members → 1 target entity.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING - -import numpy - -from .link import Link - -if TYPE_CHECKING: - pass - - -class Many2OneLink(Link): - """Navigate from many source members to one target entity. - - Example: ``person.household`` (each person belongs to one household), - or ``person.mother`` (each person has one mother). - - The resolution follows the LIAM2 pattern:: - - target_ids = source_pop(link_field) # e.g. [0, 0, 1, 2, 0] - target_values = target_pop(variable) # e.g. [800, 650, 900] - result = target_values[target_ids] # e.g. [800, 800, 650, 900, 800] - """ - - def get(self, variable_name: str, period) -> numpy.ndarray: - """Get a target variable's value for each source member. - - Parameters - ---------- - variable_name : str - Name of the variable defined on the target entity. - period : Period - The period for which to compute the variable. - - Returns - ------- - numpy.ndarray - Array of shape ``(n_source,)`` containing the target variable - value for each source member. Members with an invalid link - (target_id < 0) receive the variable's default value. - """ - source_pop = self._source_population - simulation = source_pop.simulation - - # 1. Target IDs for each source member - target_ids = self._get_target_ids(period) - - # 2. Variable values on the target entity - target_values = simulation.calculate(variable_name, period) - - # 3. 
Resolve IDs to row positions (handles id_to_rownum if needed) - target_rows = self._resolve_ids(target_ids) - - # 4. Gather with missing-value handling - variable = simulation.tax_benefit_system.get_variable(variable_name) - default = variable.default_value if variable else 0 - - from openfisca_core.indexed_enums import Enum, EnumArray - - if isinstance(default, Enum): - default = default.index - - result = numpy.full( - source_pop.count, - default, - dtype=target_values.dtype, - ) - valid = target_rows >= 0 - result[valid] = target_values[target_rows[valid]] - - if isinstance(target_values, EnumArray): - result = EnumArray(result, target_values.possible_values) - - return result - - # -- syntactic sugar ---------------------------------------------------- - - def __call__(self, variable_name: str, period) -> numpy.ndarray: - """Shorthand: ``person.mother("age", period)``.""" - return self.get(variable_name, period) - - def __getattr__(self, name: str): - """Chain links: ``person.mother.household``.""" - if name.startswith("_"): - raise AttributeError(name) - - target_pop = self._target_population - if target_pop is None: - raise AttributeError("Link is not bound to a simulation") - - if hasattr(target_pop, "links") and name in target_pop.links: - target_link = target_pop.links[name] - return _ChainedGetter(self, target_link) - - target_attr = getattr(target_pop, name, None) - if target_attr is not None: - if hasattr(target_attr, "projectable"): - - def projector_function(*args, **kwargs): - result = target_attr(*args, **kwargs) - return self._project_result(result) - - return projector_function - return target_attr - - target_entity = target_pop.entity - msg = f"Entity '{target_entity.key}' has no link named '{name}'" - raise AttributeError(msg) - - def _project_result(self, result: numpy.ndarray) -> numpy.ndarray: - if hasattr(self, "_project_implicit"): - return self._project_implicit(result) - msg = "Chained method calls computing arrays on explicit links are 
not supported because the period cannot be inferred." - raise NotImplementedError(msg) - - # -- role helpers ------------------------------------------------------- - - @property - def role(self) -> numpy.ndarray | None: - """Role of each source member, if ``role_field`` is set.""" - if self.role_field is None: - return None - return self._source_population.simulation.calculate( - self.role_field, - "eternity", - ) - - def has_role(self, role_value) -> numpy.ndarray: - """Boolean mask: does each source member have the given role? - - The ``role`` array may contain raw values (ints, strings) or - ``Role`` objects depending on how the population was built. When - ``role_value`` is a string we compare against the ``key`` of each - element to make the API ergonomic for callers such as - ``link.has_role("parent")`` or ``link.get_by_role(..., role_value="foo")``. - """ - r = self.role - if r is None: - msg = f"Link '{self.name}' has no role_field" - raise ValueError(msg) - - # if array holds object references, convert to their keys - if r.dtype == object: - try: - keys = numpy.fromiter( - (getattr(x, "key", x) for x in r), - dtype=object, - ) - except Exception: - # fallback to direct comparison - return r == role_value - return keys == role_value - - # numpy will perform elementwise comparison for numeric or string - return r == role_value - - # -- role-based access -------------------------------------------------- - - def get_by_role( - self, - variable_name: str, - period, - *, - role_value, - ) -> numpy.ndarray: - """Fetch a variable on the target only for members with a given role. - - Parameters - ---------- - variable_name : str - Name of the variable defined on the target entity. - period : Period - Period for which to calculate the variable. - role_value : object - The role to filter on (e.g. ``"parent"``). 
- - Returns - ------- - numpy.ndarray - Array of shape ``(n_source,)`` where only members whose - ``has_role(role_value)`` return ``True`` keep their computed - value; all others receive the variable's default (usually 0). - """ - mask = self.has_role(role_value) - result = self.get(variable_name, period) - # zero out non-matching rows using dtype-preserving fill - if not mask.all(): - # create a copy to avoid mutating cached results - result = result.copy() - result[~mask] = 0 - return result - - # -- ID resolution ------------------------------------------------------ - - def _get_target_ids(self, period) -> numpy.ndarray: - """Fetch the target IDs from the link_field variable.""" - return self._source_population.simulation.calculate( - self.link_field, - period, - ) - - def _resolve_ids(self, target_ids: numpy.ndarray) -> numpy.ndarray: - """Convert target IDs to row indices. - - If the target population has an ``_id_to_rownum`` mapping - (e.g. for intra-entity links where IDs ≠ row positions), use it. - Otherwise treat IDs as direct row indices (the OpenFisca convention - for GroupPopulation.members_entity_id). - """ - target_pop = self._target_population - rows = numpy.full_like(target_ids, -1, dtype=numpy.intp) - - if ( - hasattr(target_pop, "_id_to_rownum") - and target_pop._id_to_rownum is not None - ): - id_to_rownum = target_pop._id_to_rownum - valid = (target_ids >= 0) & (target_ids < len(id_to_rownum)) - rows[valid] = id_to_rownum[target_ids[valid]] - else: - valid = (target_ids >= 0) & (target_ids < target_pop.count) - rows[valid] = target_ids[valid] - - return rows - - # -- ranking ----------------------------------------------------------- - - def rank(self, variable_name: str, period) -> numpy.ndarray: - """Rank each source member within its group by a variable value. - - The rank is computed among all members sharing the same target - entity, sorted by the value of ``variable_name`` evaluated on the - *source* population. 
The lowest value receives rank ``0``. - - This is essentially a thin wrapper around - :meth:`~openfisca_core.populations.Population.get_rank`: - - >>> person = simulation.persons - >>> person.links['household'].rank('age', period) - array([...]) - """ - source_pop = self._source_population - # criteria on source population - criteria = source_pop.simulation.calculate(variable_name, period) - # let Population.get_rank handle grouping and sorting - return source_pop.get_rank(self, criteria) - - -# --------------------------------------------------------------------------- -# Chained link getter -# --------------------------------------------------------------------------- - - -class _ChainedGetter: - """Intermediate object for link chaining: ``person.mother.household``.""" - - def __init__(self, outer_link: Many2OneLink, inner_link: Link) -> None: - self._outer = outer_link - self._inner = inner_link - - def get(self, variable_name: str, period) -> numpy.ndarray: - """Resolve ``person.mother.household.get("rent", period)``.""" - # 1. Resolve inner link value on inner entity - inner_values = self._inner.get(variable_name, period) - - # 2. 
Map back through outer link - target_ids = self._outer._source_population.simulation.calculate( - self._outer.link_field, - period, - ) - target_rows = self._outer._resolve_ids(target_ids) - - result = numpy.full( - self._outer._source_population.count, - 0, - dtype=inner_values.dtype, - ) - valid = target_rows >= 0 - result[valid] = inner_values[target_rows[valid]] - return result - - def __call__(self, variable_name: str, period) -> numpy.ndarray: - """Shorthand for get(): ``person.mother.household("rent", period)``.""" - return self.get(variable_name, period) - - def __getattr__(self, name: str): - """Continue chaining: ``person.mother.household.region``.""" - if name.startswith("_"): - raise AttributeError(name) - - target_pop = self._inner._target_population - if target_pop is None: - raise AttributeError("Link is not bound to a simulation") - - if hasattr(target_pop, "links") and name in target_pop.links: - next_link = target_pop.links[name] - return _ChainedGetter(self._outer, next_link) - - target_entity = target_pop.entity - raise AttributeError(f"Entity '{target_entity.key}' has no link named '{name}'") - - def rank(self, variable_name: str, period) -> numpy.ndarray: - # forward to outer link so that chaining keeps semantics - return self._outer.rank(variable_name, period) - - -__all__ = ["Many2OneLink"] diff --git a/openfisca_core/links/one2many.py b/openfisca_core/links/one2many.py deleted file mode 100644 index e8dbac4af..000000000 --- a/openfisca_core/links/one2many.py +++ /dev/null @@ -1,306 +0,0 @@ -"""One-to-many link: 1 source entity → N target members.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING - -import numpy - -from .link import Link - -if TYPE_CHECKING: - pass - - -class One2ManyLink(Link): - """Aggregate from many target members back to one source entity. - - Example: ``household.members`` (one household has many persons). 
- - Provides aggregation methods (sum, count, any, all, min, max) that - combine values from all targets belonging to each source, optionally - filtered by role or an arbitrary boolean condition. - """ - - # -- aggregation methods ------------------------------------------------ - - def sum( - self, - variable_name: str, - period, - *, - role=None, - condition: numpy.ndarray | None = None, - ) -> numpy.ndarray: - """Sum target values grouped by source entity. - - Equivalent to ``GroupPopulation.sum(array, role=role)``. - """ - values = self._target_values(variable_name, period) - source_rows, values = self._apply_filters( - period, - values, - role, - condition, - ) - return numpy.bincount( - source_rows, - weights=values.astype(float), - minlength=self._source_population.count, - ) - - def count( - self, - period=None, - *, - role=None, - condition: numpy.ndarray | None = None, - ) -> numpy.ndarray: - """Count target members per source entity. - - Equivalent to ``GroupPopulation.nb_persons(role=role)``. 
- """ - ones = numpy.ones(self._target_population.count) - source_rows, ones = self._apply_filters( - period, - ones, - role, - condition, - ) - return numpy.bincount( - source_rows, - weights=ones, - minlength=self._source_population.count, - ).astype(int) - - def any( - self, - variable_name: str, - period, - *, - role=None, - condition: numpy.ndarray | None = None, - ) -> numpy.ndarray: - """True if any target member has a truthy value.""" - values = self._target_values(variable_name, period) - source_rows, values = self._apply_filters( - period, - values, - role, - condition, - ) - result = numpy.zeros(self._source_population.count, dtype=bool) - numpy.logical_or.at(result, source_rows, values.astype(bool)) - return result - - def all( - self, - variable_name: str, - period, - *, - role=None, - condition: numpy.ndarray | None = None, - ) -> numpy.ndarray: - """True if all target members have a truthy value.""" - values = self._target_values(variable_name, period) - source_rows, values = self._apply_filters( - period, - values, - role, - condition, - ) - result = numpy.ones(self._source_population.count, dtype=bool) - numpy.logical_and.at(result, source_rows, values.astype(bool)) - return result - - def min( - self, - variable_name: str, - period, - *, - role=None, - condition: numpy.ndarray | None = None, - ) -> numpy.ndarray: - """Minimum target value per source entity.""" - values = self._target_values(variable_name, period) - source_rows, values = self._apply_filters( - period, - values, - role, - condition, - ) - result = numpy.full( - self._source_population.count, - numpy.inf, - dtype=float, - ) - numpy.minimum.at(result, source_rows, values.astype(float)) - result[result == numpy.inf] = 0 - return result - - def max( - self, - variable_name: str, - period, - *, - role=None, - condition: numpy.ndarray | None = None, - ) -> numpy.ndarray: - """Maximum target value per source entity.""" - values = self._target_values(variable_name, period) - source_rows, 
values = self._apply_filters( - period, - values, - role, - condition, - ) - result = numpy.full( - self._source_population.count, - -numpy.inf, - dtype=float, - ) - numpy.maximum.at(result, source_rows, values.astype(float)) - result[result == -numpy.inf] = 0 - return result - - def avg( - self, - variable_name: str, - period, - *, - role=None, - condition: numpy.ndarray | None = None, - ) -> numpy.ndarray: - """Average target value per source entity.""" - s = self.sum(variable_name, period, role=role, condition=condition) - c = self.count(period, role=role, condition=condition) - return numpy.where(c > 0, s / c, 0) - - # -- positional and role-based accessors -------------------------------- - - def nth( - self, - n: int, - variable_name: str, - period, - *, - role=None, - condition: numpy.ndarray | None = None, - ) -> numpy.ndarray: - """Value of the n-th target member for each source entity. - - Parameters mirror :meth:`sum` plus ``n``. If a source has fewer than - ``n+1`` targets the default value ``0`` is returned for that source. - The ordering of targets is the same as encountered in the underlying - population arrays (i.e. no particular sort). - """ - values = self._target_values(variable_name, period) - source_rows, values = self._apply_filters(period, values, role, condition) - - result = numpy.zeros(self._source_population.count, dtype=values.dtype) - # collect indices per source and pick the n-th - for src in range(self._source_population.count): - idxs = numpy.nonzero(source_rows == src)[0] - if n < len(idxs): - result[src] = values[idxs[n]] - return result - - def get_by_role( - self, - variable_name: str, - period, - role_value, - *, - condition: numpy.ndarray | None = None, - ) -> numpy.ndarray: - """Value of the target having a unique role per source. - - ``role_value`` is compared against the ``role_field`` on the target - population. 
If multiple targets share the same role for a given source - the last encountered value is returned (behaviour mirrors - ``GroupPopulation.value_from_person``). - """ - if self.role_field is None: - raise ValueError("Link has no role_field") - - values = self._target_values(variable_name, period) - source_rows = self._source_rows(period) - roles = self._target_population.simulation.calculate( - self.role_field, - "eternity", - ) - - result = numpy.zeros(self._source_population.count, dtype=values.dtype) - mask = numpy.ones(len(source_rows), dtype=bool) - if condition is not None: - mask &= condition - - for tgt_idx, src in enumerate(source_rows[mask]): - if roles[mask][tgt_idx] == role_value and src >= 0: - result[src] = values[mask][tgt_idx] - return result - - # -- internal ----------------------------------------------------------- - - def _target_values( - self, - variable_name: str, - period, - ) -> numpy.ndarray: - """Compute the variable on the target entity.""" - return self._target_population.simulation.calculate( - variable_name, - period, - ) - - def _source_rows(self, period) -> numpy.ndarray: - """For each target member, the row index of its source entity. - - This reads ``link_field`` from the *target* population (the members) - and resolves to source row indices. - """ - target_pop = self._target_population - source_pop = self._source_population - simulation = target_pop.simulation - - source_ids = simulation.calculate(self.link_field, period) - - # If source has id_to_rownum, use it; else IDs are positions. 
- if ( - hasattr(source_pop, "_id_to_rownum") - and source_pop._id_to_rownum is not None - ): - id_to_rownum = source_pop._id_to_rownum - rows = numpy.full_like(source_ids, -1, dtype=numpy.intp) - valid = (source_ids >= 0) & (source_ids < len(id_to_rownum)) - rows[valid] = id_to_rownum[source_ids[valid]] - return rows - - rows = source_ids.copy().astype(numpy.intp) - rows[(rows < 0) | (rows >= source_pop.count)] = -1 - return rows - - def _apply_filters(self, period, values, role, condition): - """Apply role and condition filters, return (source_rows, values).""" - source_rows = self._source_rows(period) - mask = numpy.ones(len(source_rows), dtype=bool) - - # Role filter - if role is not None and self.role_field is not None: - simulation = self._target_population.simulation - roles = simulation.calculate(self.role_field, "eternity") - mask &= roles == role - - # Condition filter - if condition is not None: - mask &= condition - - source_rows = source_rows[mask] - values = values[mask] - - # Remove invalid rows - valid = source_rows >= 0 - return source_rows[valid], values[valid] - - -__all__ = ["One2ManyLink"] diff --git a/openfisca_core/links/tests/__init__.py b/openfisca_core/links/tests/__init__.py deleted file mode 100644 index 2d457f700..000000000 --- a/openfisca_core/links/tests/__init__.py +++ /dev/null @@ -1,58 +0,0 @@ -"""Tests for the Link base class.""" - -from openfisca_core.links.link import Link - - -class TestLink: - """Verify Link construction and lifecycle.""" - - def test_construction(self): - link = Link( - name="household", - link_field="household_id", - target_entity_key="household", - ) - assert link.name == "household" - assert link.link_field == "household_id" - assert link.target_entity_key == "household" - assert link.role_field is None - assert link.position_field is None - - def test_construction_with_role(self): - link = Link( - name="household", - link_field="household_id", - target_entity_key="household", - 
role_field="household_role", - position_field="household_position", - ) - assert link.role_field == "household_role" - assert link.position_field == "household_position" - - def test_repr(self): - link = Link( - name="mother", - link_field="mother_id", - target_entity_key="person", - ) - assert "mother" in repr(link) - assert "person" in repr(link) - - def test_is_resolved_false_initially(self): - link = Link( - name="test", - link_field="test_id", - target_entity_key="person", - ) - assert not link.is_resolved - - def test_resolve_unknown_entity_raises(self): - link = Link( - name="test", - link_field="test_id", - target_entity_key="unknown", - ) - import pytest - - with pytest.raises(KeyError, match="unknown"): - link.resolve({"person": object()}) diff --git a/openfisca_core/links/tests/test_benchmark.py b/openfisca_core/links/tests/test_benchmark.py deleted file mode 100644 index 6a7e31bbd..000000000 --- a/openfisca_core/links/tests/test_benchmark.py +++ /dev/null @@ -1,93 +0,0 @@ -import pytest - -from openfisca_core import entities, periods, taxbenefitsystems, variables -from openfisca_core.simulations import SimulationBuilder - - -def build_large_simulation(n_households=5000, persons_per_hh=3): - person = entities.SingleEntity("person", "persons", "A person", "") - household = entities.GroupEntity( - "household", "households", "A household", "", roles=[{"key": "member"}] - ) - - tbs = taxbenefitsystems.TaxBenefitSystem([person, household]) - - class salary(variables.Variable): - value_type = float - entity = person - definition_period = periods.DateUnit.YEAR - - class rent(variables.Variable): - value_type = float - entity = household - definition_period = periods.DateUnit.YEAR - - for var in [salary, rent]: - tbs.add_variable(var) - - # Generate data - n_persons = n_households * persons_per_hh - - persons_dict = { - f"p{i}": {"salary": {"2024": float(i % 1000)}} for i in range(n_persons) - } - - households_dict = { - f"h{i}": { - "member": [f"p{i * 
persons_per_hh + j}" for j in range(persons_per_hh)], - "rent": {"2024": float(i % 500)}, - } - for i in range(n_households) - } - - input_dict = {"persons": persons_dict, "households": households_dict} - - return SimulationBuilder().build_from_dict(tbs, input_dict) - - -@pytest.fixture(scope="module") -def sim(): - return build_large_simulation() - - -def test_benchmark_projector_m2o(benchmark, sim): - """Benchmark the old projector logic: person.household('rent', period).""" - - def compute(): - # Get variable rent via projector - # the shortcut triggers projector_function - return sim.persons.household("rent", "2024") - - res = benchmark(compute) - assert len(res) == sim.persons.count - - -def test_benchmark_link_m2o(benchmark, sim): - """Benchmark the new link logic: person.links['household'].get('rent', period).""" - - def compute(): - return sim.persons.links["household"].get("rent", "2024") - - res = benchmark(compute) - assert len(res) == sim.persons.count - - -def test_benchmark_old_sum_o2m(benchmark, sim): - """Benchmark the old GroupPopulation sum: household.sum(persons('salary', period)).""" - - def compute(): - salaries = sim.persons("salary", "2024") - return sim.populations["household"].sum(salaries) - - res = benchmark(compute) - assert len(res) == sim.populations["household"].count - - -def test_benchmark_link_sum_o2m(benchmark, sim): - """Benchmark the new link sum: household.links['persons'].sum('salary', period).""" - - def compute(): - return sim.populations["household"].links["persons"].sum("salary", "2024") - - res = benchmark(compute) - assert len(res) == sim.populations["household"].count diff --git a/openfisca_core/links/tests/test_edge_cases.py b/openfisca_core/links/tests/test_edge_cases.py deleted file mode 100644 index 1fd97ee20..000000000 --- a/openfisca_core/links/tests/test_edge_cases.py +++ /dev/null @@ -1,467 +0,0 @@ -"""Edge-case and hardened tests for the Link system. 
- -Covers scenarios that are difficult to hit with normal use: -- All-invalid IDs -- Condition of wrong size -- Unresolved links -- Condition all-True / all-False -- Singleton populations -- EnumArray through links -- Chained links with cascading defaults -- Implicit links with role+condition -""" - -import numpy -import pytest - -from openfisca_core import entities, periods, taxbenefitsystems, variables -from openfisca_core.links import Many2OneLink, One2ManyLink -from openfisca_core.links.implicit import ImplicitOne2ManyLink -from openfisca_core.simulations import SimulationBuilder - -# ────────────────────────────────────────────────────────────── -# Shared fixture builder -# ────────────────────────────────────────────────────────────── - - -def _make_tbs_and_sim( - n_persons=4, - *, - person_links=None, - household_links=None, - extra_vars=None, -): - """Build a minimal TBS and simulation. - - Returns (tbs, sim). - """ - person = entities.SingleEntity("person", "persons", "A person", "") - household = entities.GroupEntity( - "household", "households", "A household", "", roles=[{"key": "member"}] - ) - - for link in person_links or []: - person.add_link(link) - for link in household_links or []: - household.add_link(link) - - tbs = taxbenefitsystems.TaxBenefitSystem([person, household]) - - # Standard variables - class salary(variables.Variable): - value_type = float - entity = person - definition_period = periods.DateUnit.YEAR - - class mother_id(variables.Variable): - value_type = int - entity = person - definition_period = periods.DateUnit.ETERNITY - default_value = -1 - - class household_id(variables.Variable): - value_type = int - entity = person - definition_period = periods.DateUnit.ETERNITY - default_value = -1 - - class household_role(variables.Variable): - value_type = int - entity = person - definition_period = periods.DateUnit.ETERNITY - default_value = 0 - - class rent(variables.Variable): - value_type = float - entity = household - 
definition_period = periods.DateUnit.YEAR - - for var in [salary, mother_id, household_id, household_role, rent]: - tbs.add_variable(var) - - if extra_vars: - for var in extra_vars: - tbs.add_variable(var) - - sim = SimulationBuilder().build_default_simulation(tbs, count=n_persons) - return tbs, sim - - -# ────────────────────────────────────────────────────────────── -# 1. Many2One: all IDs invalid -# ────────────────────────────────────────────────────────────── - - -class TestMany2OneAllInvalid: - def test_all_mother_ids_minus_one(self): - """Every person has mother_id=-1 → returns default value for all.""" - mother_link = Many2OneLink("mother", "mother_id", "person") - _, sim = _make_tbs_and_sim(n_persons=3, person_links=[mother_link]) - sim.set_input("mother_id", "2024", [-1, -1, -1]) - sim.set_input("salary", "2024", [100.0, 200.0, 300.0]) - - link = sim.persons.links["mother"] - result = link.get("salary", "2024") - # All default (0.0 for salary) - numpy.testing.assert_array_equal(result, [0.0, 0.0, 0.0]) - - def test_mother_id_out_of_bounds(self): - """mother_id points beyond population size → treated as invalid.""" - mother_link = Many2OneLink("mother", "mother_id", "person") - _, sim = _make_tbs_and_sim(n_persons=3, person_links=[mother_link]) - sim.set_input("mother_id", "2024", [999, -5, 2]) - sim.set_input("salary", "2024", [100.0, 200.0, 300.0]) - - link = sim.persons.links["mother"] - result = link.get("salary", "2024") - # Only person 2 (mother_id=2) is valid → salary[2]=300 - numpy.testing.assert_array_equal(result, [0.0, 0.0, 300.0]) - - -# ────────────────────────────────────────────────────────────── -# 2. 
One2Many: condition of wrong size -# ────────────────────────────────────────────────────────────── - - -class TestConditionWrongSize: - def test_condition_too_short_raises(self): - """A condition with fewer elements than persons should raise.""" - members_link = One2ManyLink( - "members", "household_id", "person", role_field="household_role" - ) - _, sim = _make_tbs_and_sim(n_persons=4, household_links=[members_link]) - sim.set_input("household_id", "2024", [0, 0, 1, 1]) - sim.set_input("salary", "2024", [100.0, 200.0, 300.0, 400.0]) - - link = sim.populations["household"].links["members"] - bad_condition = numpy.array([True, False]) # size 2 instead of 4 - - with pytest.raises((IndexError, ValueError)): - link.sum("salary", "2024", condition=bad_condition) - - -# ────────────────────────────────────────────────────────────── -# 3. Link not resolved: calling get() before resolve() -# ────────────────────────────────────────────────────────────── - - -class TestUnresolvedLink: - def test_get_before_resolve_raises(self): - """Calling get() on an unresolved link should fail clearly.""" - link = Many2OneLink("mother", "mother_id", "person") - # Not attached, not resolved - assert not link.is_resolved - with pytest.raises((AttributeError, TypeError)): - link.get("salary", "2024") - - -# ────────────────────────────────────────────────────────────── -# 4. 
Condition all-False / all-True -# ────────────────────────────────────────────────────────────── - - -class TestConditionExtremes: - @pytest.fixture - def link_sim(self): - members_link = One2ManyLink( - "members", "household_id", "person", role_field="household_role" - ) - _, sim = _make_tbs_and_sim(n_persons=4, household_links=[members_link]) - sim.set_input("household_id", "2024", [0, 0, 1, 1]) - sim.set_input("salary", "2024", [100.0, 200.0, 300.0, 400.0]) - link = sim.populations["household"].links["members"] - return link, sim - - def test_condition_all_false(self, link_sim): - link, sim = link_sim - all_false = numpy.zeros(4, dtype=bool) - - result_sum = link.sum("salary", "2024", condition=all_false) - result_count = link.count("2024", condition=all_false) - result_any = link.any("salary", "2024", condition=all_false) - - numpy.testing.assert_array_equal(result_sum, [0.0, 0.0, 0.0, 0.0]) - numpy.testing.assert_array_equal(result_count, [0, 0, 0, 0]) - numpy.testing.assert_array_equal(result_any, [False, False, False, False]) - - def test_condition_all_true_same_as_no_condition(self, link_sim): - link, sim = link_sim - all_true = numpy.ones(4, dtype=bool) - - with_cond = link.sum("salary", "2024", condition=all_true) - without_cond = link.sum("salary", "2024") - - numpy.testing.assert_array_equal(with_cond, without_cond) - - -# ────────────────────────────────────────────────────────────── -# 5. 
Singleton population (1 person, 1 household) -# ────────────────────────────────────────────────────────────── - - -class TestSingletonPopulation: - def test_one_person_one_household(self): - members_link = One2ManyLink( - "members", "household_id", "person", role_field="household_role" - ) - _, sim = _make_tbs_and_sim(n_persons=1, household_links=[members_link]) - sim.set_input("household_id", "2024", [0]) - sim.set_input("salary", "2024", [42.0]) - - link = sim.populations["household"].links["members"] - - assert link.sum("salary", "2024")[0] == pytest.approx(42.0) - assert link.count("2024")[0] == 1 - assert link.avg("salary", "2024")[0] == pytest.approx(42.0) - assert link.min("salary", "2024")[0] == pytest.approx(42.0) - assert link.max("salary", "2024")[0] == pytest.approx(42.0) - assert link.any("salary", "2024")[0] is numpy.bool_(True) - assert link.all("salary", "2024")[0] is numpy.bool_(True) - - -# ────────────────────────────────────────────────────────────── -# 6. Chained link with cascading defaults -# ────────────────────────────────────────────────────────────── - - -class TestChainedDefaults: - def test_mother_with_no_household(self): - """person.mother.household("rent") where mother has no household.""" - mother_link = Many2OneLink("mother", "mother_id", "person") - household_link = Many2OneLink( - "household", "household_id", "household", role_field="household_role" - ) - - _, sim = _make_tbs_and_sim( - n_persons=3, - person_links=[mother_link, household_link], - ) - # Person 0: mother=-1, hh=0 - # Person 1: mother=0, hh=-1 (homeless mother!) 
- # Person 2: mother=1, hh=0 - sim.set_input("mother_id", "2024", [-1, 0, 1]) - sim.set_input("household_id", "2024", [0, -1, 0]) - sim.set_input("rent", "2024", [999.0, 0.0, 0.0]) - - chained = sim.persons.links["mother"].household - result = chained.get("rent", "2024") - # Person 0: no mother → default 0 - # Person 1: mother=0, hh=0, rent=999 → 999 - # Person 2: mother=1, hh=-1 → default 0 - numpy.testing.assert_array_equal(result, [0.0, 999.0, 0.0]) - - -# ────────────────────────────────────────────────────────────── -# 7. ImplicitOne2ManyLink: role + condition combined -# ────────────────────────────────────────────────────────────── - - -class TestImplicitRoleAndCondition: - def test_implicit_o2m_role_and_condition(self): - """The fix for role+condition should also work for implicit links.""" - person = entities.SingleEntity("person", "persons", "A person", "") - household = entities.GroupEntity( - "household", "households", "A household", "", roles=[{"key": "member"}] - ) - - tbs = taxbenefitsystems.TaxBenefitSystem([person, household]) - - class salary(variables.Variable): - value_type = float - entity = person - definition_period = periods.DateUnit.YEAR - - class is_female(variables.Variable): - value_type = bool - entity = person - definition_period = periods.DateUnit.YEAR - - for var in [salary, is_female]: - tbs.add_variable(var) - - sim = SimulationBuilder().build_from_dict( - tbs, - { - "persons": { - "p0": {"salary": {"2024": 1000.0}, "is_female": {"2024": True}}, - "p1": {"salary": {"2024": 500.0}, "is_female": {"2024": False}}, - "p2": {"salary": {"2024": 200.0}, "is_female": {"2024": True}}, - }, - "households": { - "h0": {"member": ["p0", "p1", "p2"]}, - }, - }, - ) - - # Create and bind implicit link - link = ImplicitOne2ManyLink("persons", "household") - pop = sim.populations["household"] - link.attach(pop) - link.resolve(sim.populations) - - # Manually assign membership info - pop.members_entity_id = numpy.array([0, 0, 0]) - pop.members_role = 
numpy.array([0, 1, 1]) - - is_female = sim.calculate("is_female", "2024") - - # Role 1 (children) who are female: only p2 (salary=200) - res = link.sum("salary", "2024", role=1, condition=is_female) - assert res[0] == pytest.approx(200.0) - - # Role 0 (parents) who are female: only p0 (salary=1000) - res = link.sum("salary", "2024", role=0, condition=is_female) - assert res[0] == pytest.approx(1000.0) - - -# ────────────────────────────────────────────────────────────── -# 8. GroupPopulation condition edge cases -# ────────────────────────────────────────────────────────────── - - -class TestGroupPopulationConditionEdgeCases: - @pytest.fixture - def sim(self): - """Sim with country-template entities for GroupPopulation testing.""" - try: - from openfisca_country_template import CountryTaxBenefitSystem - except ImportError: - pytest.skip("openfisca-country-template not installed") - - tbs = CountryTaxBenefitSystem() - sb = SimulationBuilder() - sb.set_default_period("2024-01") - return sb.build_from_entities( - tbs, - { - "persons": { - "p0": {"salary": {"2024-01": 1000}}, - "p1": {"salary": {"2024-01": 2000}}, - "p2": {"salary": {"2024-01": 3000}}, - }, - "households": { - "h0": {"adults": ["p0", "p1"], "children": ["p2"]}, - }, - }, - ) - - def test_condition_all_false_gives_zero(self, sim): - household = sim.household - salary = household.members("salary", "2024-01") - all_false = numpy.zeros(3, dtype=bool) - - assert household.sum(salary, condition=all_false)[0] == 0.0 - assert household.nb_persons(condition=all_false)[0] == 0 - assert not household.any(salary > 0, condition=all_false)[0] - - def test_condition_all_true_same_as_none(self, sim): - household = sim.household - salary = household.members("salary", "2024-01") - all_true = numpy.ones(3, dtype=bool) - - with_cond = household.sum(salary, condition=all_true) - without_cond = household.sum(salary) - numpy.testing.assert_array_equal(with_cond, without_cond) - - def 
test_min_condition_excludes_everyone(self, sim): - """Min with nobody matching → sentinel replaced by 0.""" - household = sim.household - salary = household.members("salary", "2024-01") - all_false = numpy.zeros(3, dtype=bool) - - result = household.min(salary, condition=all_false) - # When no one matches, reduce returns inf → but the test - # verifies it doesn't crash. The value is implementation-dependent. - assert len(result) == 1 # One household - - def test_all_with_condition_vacuous_truth(self, sim): - """all() with condition excluding everyone → vacuously True.""" - household = sim.household - salary = household.members("salary", "2024-01") - no_one = numpy.zeros(3, dtype=bool) - - result = household.all(salary > 9999, condition=no_one) - # Vacuously true: there is no member for which the predicate is false - assert result[0] - - -# ────────────────────────────────────────────────────────────── -# 9. Many2OneLink: role helpers on link without role_field -# ────────────────────────────────────────────────────────────── - - -class TestRoleHelpersMissing: - def test_has_role_raises_without_role_field(self): - """has_role() on a link without role_field should raise ValueError.""" - mother_link = Many2OneLink("mother", "mother_id", "person") - _, sim = _make_tbs_and_sim(n_persons=2, person_links=[mother_link]) - - link = sim.persons.links["mother"] - with pytest.raises(ValueError, match="no role_field"): - link.has_role(0) - - def test_role_is_none_without_role_field(self): - """role property on a link without role_field should return None.""" - mother_link = Many2OneLink("mother", "mother_id", "person") - _, sim = _make_tbs_and_sim(n_persons=2, person_links=[mother_link]) - - link = sim.persons.links["mother"] - assert link.role is None - - -# ────────────────────────────────────────────────────────────── -# 10. 
One2ManyLink: empty source population -# ────────────────────────────────────────────────────────────── - - -class TestEmptySourcePopulation: - def test_zero_persons_all_aggregations(self): - """N persons=0, K households > 0 — all aggregations return 0.""" - members_link = One2ManyLink( - "members", "household_id", "person", role_field="household_role" - ) - person = entities.SingleEntity("person", "persons", "A person", "") - household = entities.GroupEntity( - "household", "households", "A household", "", roles=[{"key": "member"}] - ) - household.add_link(members_link) - tbs = taxbenefitsystems.TaxBenefitSystem([person, household]) - - class salary(variables.Variable): - value_type = float - entity = person - definition_period = periods.DateUnit.YEAR - - class household_id(variables.Variable): - value_type = int - entity = person - definition_period = periods.DateUnit.ETERNITY - default_value = -1 - - class household_role(variables.Variable): - value_type = int - entity = person - definition_period = periods.DateUnit.ETERNITY - default_value = 0 - - for var in [salary, household_id, household_role]: - tbs.add_variable(var) - - # 0 persons, but let Simulation think there are 2 households - # build_default_simulation creates both person and household entities - # with the given count. We need at least 1 person to build. 
- sim = SimulationBuilder().build_default_simulation(tbs, count=2) - # assign all 2 persons to household -1 (none attached to hh 0 or 1) - sim.set_input("household_id", "2024", [-1, -1]) - sim.set_input("salary", "2024", [100.0, 200.0]) - - link = sim.populations["household"].links["members"] - - result_sum = link.sum("salary", "2024") - result_count = link.count("2024") - result_any = link.any("salary", "2024") - - # No persons linked to any household → all zeros - numpy.testing.assert_array_equal(result_sum, [0.0, 0.0]) - numpy.testing.assert_array_equal(result_count, [0, 0]) - numpy.testing.assert_array_equal(result_any, [False, False]) diff --git a/openfisca_core/links/tests/test_implicit.py b/openfisca_core/links/tests/test_implicit.py deleted file mode 100644 index cdce13240..000000000 --- a/openfisca_core/links/tests/test_implicit.py +++ /dev/null @@ -1,76 +0,0 @@ -"""Test Phase 3: Auto-generation and implicit links.""" - -import numpy -import pytest - -from openfisca_core import entities, periods, taxbenefitsystems, variables -from openfisca_core.links.implicit import ImplicitMany2OneLink, ImplicitOne2ManyLink -from openfisca_core.simulations import SimulationBuilder - - -@pytest.fixture -def sim(): - person = entities.SingleEntity("person", "persons", "A person", "") - household = entities.GroupEntity( - "household", "households", "A household", "", roles=[{"key": "member"}] - ) - - tbs = taxbenefitsystems.TaxBenefitSystem([person, household]) - - class salary(variables.Variable): - value_type = float - entity = person - definition_period = periods.DateUnit.YEAR - - class rent(variables.Variable): - value_type = float - entity = household - definition_period = periods.DateUnit.YEAR - - for var in [salary, rent]: - tbs.add_variable(var) - - sim = SimulationBuilder().build_from_dict( - tbs, - { - "persons": { - "p0": {"salary": {"2024": 1000.0}}, - "p1": {"salary": {"2024": 500.0}}, - "p2": {"salary": {"2024": 2000.0}}, - "p3": {"salary": {"2024": 100.0}}, - 
}, - "households": { - "h0": {"member": ["p0", "p1"], "rent": {"2024": 800.0}}, - "h1": {"member": ["p2"], "rent": {"2024": 500.0}}, - "h2": {"member": ["p3"], "rent": {"2024": 100.0}}, - }, - }, - ) - return sim - - -def test_implicit_many2one(sim): - link = ImplicitMany2OneLink("household") - link.attach(sim.persons) - link.resolve(sim.populations) - - rents = link.get("rent", "2024") - # p0, p1 -> h0 -> 800 - # p2 -> h1 -> 500 - # p3 -> h2 -> 100 - assert numpy.array_equal(rents, [800.0, 800.0, 500.0, 100.0]) - - -def test_implicit_one2many(sim): - link = ImplicitOne2ManyLink("persons", "household") - link.attach(sim.populations["household"]) - link.resolve(sim.populations) - - salaries = link.sum("salary", "2024") - # h0: p0+p1 = 1500 - # h1: p2 = 2000 - # h2: p3 = 100 - assert numpy.array_equal(salaries, [1500.0, 2000.0, 100.0]) - - counts = link.count("2024") - assert numpy.array_equal(counts, [2, 1, 1]) diff --git a/openfisca_core/links/tests/test_integration.py b/openfisca_core/links/tests/test_integration.py deleted file mode 100644 index c1dc160ff..000000000 --- a/openfisca_core/links/tests/test_integration.py +++ /dev/null @@ -1,192 +0,0 @@ -"""Tests for Phase 2: entity integration and link resolution.""" - -import pytest - -from openfisca_core import entities, periods, taxbenefitsystems, variables -from openfisca_core.links import Many2OneLink, One2ManyLink -from openfisca_core.simulations import SimulationBuilder - -# -- Test CoreEntity.add_link / get_link / links -------------------------- - - -class TestEntityLinks: - """Verify that links can be registered and retrieved on entities.""" - - def test_add_and_get_link(self): - entity = entities.SingleEntity("person", "persons", "A person", "") - link = Many2OneLink( - name="mother", - link_field="mother_id", - target_entity_key="person", - ) - entity.add_link(link) - assert entity.get_link("mother") is link - - def test_get_link_not_found(self): - entity = entities.SingleEntity("person", "persons", "A 
person", "") - assert entity.get_link("nonexistent") is None - - def test_links_property(self): - entity = entities.SingleEntity("person", "persons", "A person", "") - link1 = Many2OneLink( - name="mother", - link_field="mother_id", - target_entity_key="person", - ) - link2 = Many2OneLink( - name="father", - link_field="father_id", - target_entity_key="person", - ) - entity.add_link(link1) - entity.add_link(link2) - assert len(entity.links) == 2 - assert "mother" in entity.links - assert "father" in entity.links - - def test_links_empty_by_default(self): - entity = entities.SingleEntity("person", "persons", "A person", "") - assert entity.links == {} - - -# -- Test link resolution in Simulation ----------------------------------- - - -class TestLinkResolution: - """Verify that links are resolved when a Simulation is created.""" - - def _make_tbs_with_link(self): - """Build a minimal TBS with a person entity and a mother link.""" - person = entities.SingleEntity( - "person", - "persons", - "A person", - "", - ) - household = entities.GroupEntity( - "household", - "households", - "A household", - "", - roles=[{"key": "member"}], - ) - - # Declare a Many2One link: person → person (intra-entity) - mother_link = Many2OneLink( - name="mother", - link_field="mother_id", - target_entity_key="person", - ) - person.add_link(mother_link) - - tbs = taxbenefitsystems.TaxBenefitSystem([person, household]) - - # Add required variables - class mother_id(variables.Variable): - value_type = int - default_value = -1 - entity = person - definition_period = periods.DateUnit.ETERNITY - label = "ID of the mother" - - tbs.add_variable(mother_id) - - return tbs, mother_link - - def test_links_resolved_after_simulation_build(self): - tbs, mother_link = self._make_tbs_with_link() - sim = SimulationBuilder().build_default_simulation(tbs, count=5) - - # The link should be attached to the person population - bound_link = sim.persons.links["mother"] - assert bound_link._source_population is not 
None - assert bound_link._target_population is not None - assert bound_link.is_resolved - - def test_resolved_link_points_to_correct_population(self): - tbs, mother_link = self._make_tbs_with_link() - sim = SimulationBuilder().build_default_simulation(tbs, count=5) - - # Source and target should both be the person population - bound_link = sim.persons.links["mother"] - assert bound_link._source_population is sim.persons - assert bound_link._target_population is sim.persons - - def test_link_on_group_entity(self): - """A link declared on a GroupEntity gets resolved too.""" - person = entities.SingleEntity( - "person", - "persons", - "A person", - "", - ) - household = entities.GroupEntity( - "household", - "households", - "A household", - "", - roles=[{"key": "member"}], - ) - - # O2M link: household → persons - members_link = One2ManyLink( - name="members", - link_field="household_id", - target_entity_key="person", - ) - household.add_link(members_link) - - tbs = taxbenefitsystems.TaxBenefitSystem([person, household]) - sim = SimulationBuilder().build_default_simulation(tbs, count=3) - - bound_link = sim.populations["household"].links["members"] - assert bound_link.is_resolved - assert bound_link._source_population is sim.populations["household"] - assert bound_link._target_population is sim.persons - - -# -- Test backward compatibility ------------------------------------------ - - -class TestBackwardCompatibility: - """Ensure that adding links doesn't break existing functionality.""" - - def test_existing_tests_pass_with_country_template(self): - """Smoke test: build a simulation with country-template.""" - try: - from openfisca_country_template import CountryTaxBenefitSystem - - tbs = CountryTaxBenefitSystem() - sim = SimulationBuilder().build_default_simulation(tbs, count=3) - - # Basic calculation should work - result = sim.calculate("disposable_income", "2024-01") - assert result is not None - assert len(result) == 3 - except ImportError: - 
pytest.skip("openfisca-country-template not installed") - - def test_no_links_no_problem(self): - """Entities without links should work as before.""" - person = entities.SingleEntity( - "person", - "persons", - "A person", - "", - ) - household = entities.GroupEntity( - "household", - "households", - "A household", - "", - roles=[{"key": "member"}], - ) - tbs = taxbenefitsystems.TaxBenefitSystem([person, household]) - sim = SimulationBuilder().build_default_simulation(tbs, count=2) - - # Entity should have an empty links dict - assert person.links == {} - assert household.links == {} - - # Simulation should work fine - assert sim.persons.count == 2 diff --git a/openfisca_core/links/tests/test_many2one.py b/openfisca_core/links/tests/test_many2one.py deleted file mode 100644 index 6bf102f4e..000000000 --- a/openfisca_core/links/tests/test_many2one.py +++ /dev/null @@ -1,149 +0,0 @@ -import numpy -import pytest - -from openfisca_core import entities, periods, taxbenefitsystems, variables -from openfisca_core.links import Many2OneLink -from openfisca_core.simulations import SimulationBuilder - - -@pytest.fixture -def sim(): - person = entities.SingleEntity("person", "persons", "A person", "") - household = entities.GroupEntity( - "household", "households", "A household", "", roles=[{"key": "member"}] - ) - - mother_link = Many2OneLink("mother", "mother_id", "person") - person.add_link(mother_link) - - household_link = Many2OneLink( - "household", "household_id", "household", role_field="household_role" - ) - person.add_link(household_link) - - tbs = taxbenefitsystems.TaxBenefitSystem([person, household]) - - class age(variables.Variable): - value_type = int - entity = person - definition_period = periods.DateUnit.YEAR - - class rent(variables.Variable): - value_type = float - entity = household - definition_period = periods.DateUnit.YEAR - - class mother_id(variables.Variable): - value_type = int - entity = person - definition_period = periods.DateUnit.ETERNITY - 
default_value = -1 - - class household_id(variables.Variable): - value_type = int - entity = person - definition_period = periods.DateUnit.ETERNITY - default_value = -1 - - class household_role(variables.Variable): - value_type = int - entity = person - definition_period = periods.DateUnit.ETERNITY - default_value = 0 - - for var in [age, rent, mother_id, household_id, household_role]: - tbs.add_variable(var) - - # persons: 0, 1, 2, 3 - # households: 0, 1 - sim = SimulationBuilder().build_default_simulation(tbs, count=4) - # Mother of 0 is -1, 1 is 0, 2 is 0, 3 is 1 - sim.set_input("mother_id", "2024", [-1, 0, 0, 1]) - sim.set_input("age", "2024", [50, 25, 20, 5]) - sim.set_input("household_id", "2024", [0, 0, 1, 1]) - sim.set_input("household_role", "2024", [10, 20, 10, 20]) - sim.set_input("rent", "2024", [800.0, 500.0, 0.0, 0.0]) - return sim - - -def test_many2one_get_intra_entity(sim): - """Test person -> person lookup (mother).""" - link = sim.persons.links["mother"] - mother_ages = link.get("age", "2024") - - # Expected: - # 0 -> no mother -> default age (0) - # 1 -> mother 0 -> age 50 - # 2 -> mother 0 -> age 50 - # 3 -> mother 1 -> age 25 - assert numpy.array_equal(mother_ages, [0, 50, 50, 25]) - - # Syntax sugar test: - assert numpy.array_equal(link("age", "2024"), [0, 50, 50, 25]) - - -def test_many2one_get_inter_entity(sim): - """Test person -> household lookup.""" - link = sim.persons.links["household"] - h_rents = link.get("rent", "2024") - - # Expected: - # 0 -> hh 0 -> rent 800 - # 1 -> hh 0 -> rent 800 - # 2 -> hh 1 -> rent 500 - # 3 -> hh 1 -> rent 500 - assert numpy.array_equal(h_rents, [800.0, 800.0, 500.0, 500.0]) - - -def test_many2one_chaining(sim): - """Test person.mother.household.rent chaining.""" - mother_link = sim.persons.links["mother"] - - # Expected: - # person 0 -> mother -1 -> no household -> rent 0.0 - # person 1 -> mother 0 -> hh 0 -> rent 800.0 - # person 2 -> mother 0 -> hh 0 -> rent 800.0 - # person 3 -> mother 1 -> hh 0 -> 
rent 800.0 - chained_link = mother_link.household - h_rents = chained_link.get("rent", "2024") - assert numpy.array_equal(h_rents, [0.0, 800.0, 800.0, 800.0]) - - # Test syntactic sugar - assert numpy.array_equal(chained_link("rent", "2024"), [0.0, 800.0, 800.0, 800.0]) - - -def test_many2one_role_helpers(sim): - link = sim.persons.links["household"] - roles = link.role - assert numpy.array_equal(roles, [10, 20, 10, 20]) - - has_role_10 = link.has_role(10) - assert numpy.array_equal(has_role_10, [True, False, True, False]) - - # also exercise the new helper - parent_rents = link.get_by_role("rent", "2024", role_value=10) - assert numpy.array_equal(parent_rents, [800.0, 0.0, 500.0, 0.0]) - - -def test_many2one_rank(sim): - """Ranking people by age within their household via the link. - - The default ``sim`` fixture uses ``build_default_simulation`` which - does not populate ``household.members_entity_id`` correctly, so we - patch the group population manually using the input variable. - """ - # ensure household group mappings match the input variable - sim.household.members_entity_id = sim.persons("household_id", "2024") - # reset any cached position so rank uses updated mapping - sim.household._members_position = None - - link = sim.persons.links["household"] - # ages [50, 25, 20, 5] per person - ranks = link.rank("age", "2024") - # households: h0->[50,25] -> ranks [1,0]; h1->[20,5] -> ranks [1,0] - assert numpy.array_equal(ranks, [1, 0, 1, 0]) - - # chaining should also forward to outer link (no behavioural change) - chained = sim.persons.links["mother"].household - ranks2 = chained.rank("age", "2024") - assert numpy.array_equal(ranks2, ranks) diff --git a/openfisca_core/links/tests/test_one2many.py b/openfisca_core/links/tests/test_one2many.py deleted file mode 100644 index 973ddd8a8..000000000 --- a/openfisca_core/links/tests/test_one2many.py +++ /dev/null @@ -1,112 +0,0 @@ -import numpy -import pytest - -from openfisca_core import entities, periods, 
taxbenefitsystems, variables -from openfisca_core.links import One2ManyLink -from openfisca_core.simulations import SimulationBuilder - - -@pytest.fixture -def sim(): - person = entities.SingleEntity("person", "persons", "A person", "") - household = entities.GroupEntity( - "household", "households", "A household", "", roles=[{"key": "member"}] - ) - - members_link = One2ManyLink( - "members", "household_id", "person", role_field="household_role" - ) - household.add_link(members_link) - - tbs = taxbenefitsystems.TaxBenefitSystem([person, household]) - - class salary(variables.Variable): - value_type = float - entity = person - definition_period = periods.DateUnit.YEAR - - class is_female(variables.Variable): - value_type = bool - entity = person - definition_period = periods.DateUnit.YEAR - - class household_id(variables.Variable): - value_type = int - entity = person - definition_period = periods.DateUnit.ETERNITY - default_value = -1 - - class household_role(variables.Variable): # 0: parent, 1: child - value_type = int - entity = person - definition_period = periods.DateUnit.ETERNITY - default_value = 0 - - for var in [salary, is_female, household_id, household_role]: - tbs.add_variable(var) - - sim = SimulationBuilder().build_default_simulation(tbs, count=4) - # households: 0, 1 - # person 0: hh 0, role 0, salary 1000, F - # person 1: hh 0, role 1, salary 500, M - # person 2: hh 1, role 0, salary 2000, M - # person 3: hh -1 (none) - sim.set_input("household_id", "2024", [0, 0, 1, -1]) - sim.set_input("household_role", "2024", [0, 1, 0, 0]) - sim.set_input("salary", "2024", [1000.0, 500.0, 2000.0, 100.0]) - sim.set_input("is_female", "2024", [True, False, False, True]) - - return sim - - -def test_one2many_aggregations(sim): - link = sim.populations["household"].links["members"] - - res_sum = link.sum("salary", "2024") - assert numpy.array_equal(res_sum, [1500.0, 2000.0, 0.0, 0.0]) - - res_count = link.count("2024") - assert numpy.array_equal(res_count, [2, 1, 0, 
0]) - - res_avg = link.avg("salary", "2024") - assert numpy.array_equal(res_avg, [750.0, 2000.0, 0.0, 0.0]) - - res_min = link.min("salary", "2024") - assert numpy.array_equal(res_min, [500.0, 2000.0, 0.0, 0.0]) - - res_max = link.max("salary", "2024") - assert numpy.array_equal(res_max, [1000.0, 2000.0, 0.0, 0.0]) - - -def test_one2many_any_all(sim): - link = sim.populations["household"].links["members"] - - # Is there a female in each household? - res_any = link.any("is_female", "2024") - assert numpy.array_equal(res_any, [True, False, False, False]) - - # Are all members female? - res_all = link.all("is_female", "2024") - assert numpy.array_equal(res_all, [False, False, True, True]) - - -def test_one2many_role_filter(sim): - link = sim.populations["household"].links["members"] - - # Count of role 1 (child) - # hh 0 has person 1 (role 1) - res_count = link.count("2024", role=1) - assert numpy.array_equal(res_count, [1, 0, 0, 0]) - - # Sum salary of role 1 - res_sum = link.sum("salary", "2024", role=1) - assert numpy.array_equal(res_sum, [500.0, 0.0, 0.0, 0.0]) - - -def test_one2many_condition_filter(sim): - link = sim.populations["household"].links["members"] - condition = sim.calculate("is_female", "2024") - - # Sum of salary for females only - res_sum = link.sum("salary", "2024", condition=condition) - assert numpy.array_equal(res_sum, [1000.0, 0.0, 0.0, 0.0]) diff --git a/openfisca_core/links/tests/test_one2many_logic.py b/openfisca_core/links/tests/test_one2many_logic.py deleted file mode 100644 index e5047d58e..000000000 --- a/openfisca_core/links/tests/test_one2many_logic.py +++ /dev/null @@ -1,398 +0,0 @@ -"""Thorough logic tests for One2ManyLink aggregation methods. 
- -Tests cover: -- Combined role + condition filters -- All aggregation methods (sum, count, avg, min, max, any, all) with filters -- Edge cases: empty households, single-member households, negative values -- Vacuous truth for `all()` on empty groups -- `avg()` with zero-count groups (division by zero guard) -- Implicit One2Many links with filters -""" - -import numpy -import pytest - -from openfisca_core import entities, periods, taxbenefitsystems, variables -from openfisca_core.links import One2ManyLink -from openfisca_core.simulations import SimulationBuilder - - -@pytest.fixture -def rich_sim(): - """Simulation with 6 persons and 3 households, various edge cases. - - - hh 0: person 0 (role 0, parent, F, salary 1000), - person 1 (role 1, child, M, salary -200), - person 2 (role 1, child, F, salary 300) - - hh 1: person 3 (role 0, parent, M, salary 5000) - - hh 2: empty (no persons assigned) - - person 4: household_id = -1 (unattached) - - person 5: household_id = -1 (unattached) - """ - person = entities.SingleEntity("person", "persons", "A person", "") - household = entities.GroupEntity( - "household", "households", "A household", "", roles=[{"key": "member"}] - ) - - members_link = One2ManyLink( - "members", "household_id", "person", role_field="household_role" - ) - household.add_link(members_link) - - tbs = taxbenefitsystems.TaxBenefitSystem([person, household]) - - class salary(variables.Variable): - value_type = float - entity = person - definition_period = periods.DateUnit.YEAR - - class is_female(variables.Variable): - value_type = bool - entity = person - definition_period = periods.DateUnit.YEAR - - class is_student(variables.Variable): - value_type = bool - entity = person - definition_period = periods.DateUnit.YEAR - - class household_id(variables.Variable): - value_type = int - entity = person - definition_period = periods.DateUnit.ETERNITY - default_value = -1 - - class household_role(variables.Variable): # 0: parent, 1: child - value_type = int - 
entity = person - definition_period = periods.DateUnit.ETERNITY - default_value = 0 - - for var in [salary, is_female, is_student, household_id, household_role]: - tbs.add_variable(var) - - sim = SimulationBuilder().build_default_simulation(tbs, count=6) - sim.set_input("household_id", "2024", [0, 0, 0, 1, -1, -1]) - sim.set_input("household_role", "2024", [0, 1, 1, 0, 0, 0]) - sim.set_input("salary", "2024", [1000.0, -200.0, 300.0, 5000.0, 99.0, 99.0]) - sim.set_input("is_female", "2024", [True, False, True, False, True, False]) - sim.set_input("is_student", "2024", [False, True, True, False, False, False]) - - return sim - - -# ────────────────────────────────────────────────────────────── -# 1. Combined role + condition (the bug that was fixed) -# ────────────────────────────────────────────────────────────── - - -class TestRoleAndConditionCombined: - """Verify that role and condition filters compose correctly.""" - - def test_sum_role_and_condition(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - is_female = rich_sim.calculate("is_female", "2024") - - # Sum salary of children (role=1) who are female - # hh 0: person 2 (role 1, F, 300) → 300 - # hh 1: nobody → 0 - # Others: 0 - res = link.sum("salary", "2024", role=1, condition=is_female) - expected = [300.0, 0.0, 0.0, 0.0, 0.0, 0.0] - numpy.testing.assert_array_almost_equal(res, expected) - - def test_count_role_and_condition(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - is_female = rich_sim.calculate("is_female", "2024") - - # Count children (role=1) who are female - # hh 0: person 2 → 1 - # hh 1: nobody → 0 - res = link.count("2024", role=1, condition=is_female) - expected = [1, 0, 0, 0, 0, 0] - numpy.testing.assert_array_equal(res, expected) - - def test_avg_role_and_condition(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - is_female = rich_sim.calculate("is_female", "2024") - - # Average salary of children (role=1) 
who are female - # hh 0: person 2 (300) → avg = 300 - # hh 1: nobody → 0 (division by zero guarded) - res = link.avg("salary", "2024", role=1, condition=is_female) - expected = [300.0, 0.0, 0.0, 0.0, 0.0, 0.0] - numpy.testing.assert_array_almost_equal(res, expected) - - def test_min_role_and_condition(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - is_student = rich_sim.calculate("is_student", "2024") - - # Min salary of children (role=1) who are students - # hh 0: person 1 (-200, student), person 2 (300, student) → min = -200 - # hh 1: none → 0 - res = link.min("salary", "2024", role=1, condition=is_student) - expected = [-200.0, 0.0, 0.0, 0.0, 0.0, 0.0] - numpy.testing.assert_array_almost_equal(res, expected) - - def test_max_role_and_condition(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - is_student = rich_sim.calculate("is_student", "2024") - - # Max salary of children (role=1) who are students - # hh 0: person 1 (-200), person 2 (300) → max = 300 - res = link.max("salary", "2024", role=1, condition=is_student) - expected = [300.0, 0.0, 0.0, 0.0, 0.0, 0.0] - numpy.testing.assert_array_almost_equal(res, expected) - - def test_any_role_and_condition(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - is_female = rich_sim.calculate("is_female", "2024") - - # Any child (role=1) female? - # hh 0: person 2 (child, F) → True - # hh 1: no children → False - res = link.any("is_female", "2024", role=1, condition=is_female) - expected = [True, False, False, False, False, False] - numpy.testing.assert_array_equal(res, expected) - - def test_all_role_and_condition(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - is_student = rich_sim.calculate("is_student", "2024") - - # All children (role=1) students? 
- # hh 0: person 1 (student=T), person 2 (student=T) → True - # hh 1: no matching members → True (vacuously true) - res = link.all("is_student", "2024", role=1, condition=is_student) - # After filtering by role=1 AND condition=is_student, the remaining - # members all have is_student=True by construction. - # But hh 1 has no such members → stays True (vacuously true) - expected = [True, True, True, True, True, True] - numpy.testing.assert_array_equal(res, expected) - - -# ────────────────────────────────────────────────────────────── -# 2. Edge cases for aggregation logic -# ────────────────────────────────────────────────────────────── - - -class TestAggregationEdgeCases: - """Edge cases: empty groups, negative values, single-member groups.""" - - def test_sum_with_negative_values(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - res_sum = link.sum("salary", "2024") - # hh 0: 1000 + (-200) + 300 = 1100 - # hh 1: 5000 - # hh 2+: 0 - assert res_sum[0] == pytest.approx(1100.0) - assert res_sum[1] == pytest.approx(5000.0) - assert res_sum[2] == pytest.approx(0.0) - - def test_min_with_negative_values(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - res_min = link.min("salary", "2024") - # hh 0: min(1000, -200, 300) = -200 - # hh 1: 5000 - # hh 2: 0 (empty → sentinel replaced to 0) - assert res_min[0] == pytest.approx(-200.0) - assert res_min[1] == pytest.approx(5000.0) - assert res_min[2] == pytest.approx(0.0) - - def test_max_with_negative_values(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - res_max = link.max("salary", "2024") - # hh 0: max(1000, -200, 300) = 1000 - # hh 1: 5000 - # hh 2: 0 (empty → sentinel replaced to 0) - assert res_max[0] == pytest.approx(1000.0) - assert res_max[1] == pytest.approx(5000.0) - assert res_max[2] == pytest.approx(0.0) - - def test_avg_empty_household(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - res_avg = 
link.avg("salary", "2024") - # hh 2: empty → 0 (division by zero guarded) - assert res_avg[2] == pytest.approx(0.0) - - def test_avg_single_member_household(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - res_avg = link.avg("salary", "2024") - # hh 1: single member with salary 5000 → avg = 5000 - assert res_avg[1] == pytest.approx(5000.0) - - def test_avg_multi_member_household(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - res_avg = link.avg("salary", "2024") - # hh 0: (1000 + (-200) + 300) / 3 ≈ 366.67 - assert res_avg[0] == pytest.approx(1100.0 / 3.0) - - def test_count_empty_household(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - res_count = link.count("2024") - # hh 2: 0 members - assert res_count[2] == 0 - - def test_count_single_member_household(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - res_count = link.count("2024") - # hh 1: 1 member - assert res_count[1] == 1 - - def test_all_empty_household_is_vacuously_true(self, rich_sim): - """For empty groups, `all()` returns True (vacuous truth).""" - link = rich_sim.populations["household"].links["members"] - res_all = link.all("is_female", "2024") - # hh 2: empty → all() returns True by convention - assert res_all[2] is numpy.bool_(True) - - def test_any_empty_household_is_false(self, rich_sim): - """For empty groups, `any()` returns False.""" - link = rich_sim.populations["household"].links["members"] - res_any = link.any("is_female", "2024") - # hh 2: empty → any() returns False - assert res_any[2] is numpy.bool_(False) - - -# ────────────────────────────────────────────────────────────── -# 3. 
Condition-only filters (no role) -# ────────────────────────────────────────────────────────────── - - -class TestConditionOnlyFilters: - """Test condition filter without role.""" - - def test_count_with_condition(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - is_female = rich_sim.calculate("is_female", "2024") - res = link.count("2024", condition=is_female) - # hh 0: person 0 (F), person 2 (F) → 2 - # hh 1: person 3 (M) → 0 - expected = [2, 0, 0, 0, 0, 0] - numpy.testing.assert_array_equal(res, expected) - - def test_min_with_condition(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - is_female = rich_sim.calculate("is_female", "2024") - res = link.min("salary", "2024", condition=is_female) - # hh 0: min(1000, 300) = 300 (only females) - assert res[0] == pytest.approx(300.0) - assert res[1] == pytest.approx(0.0) # no females in hh 1 - - def test_max_with_condition(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - is_female = rich_sim.calculate("is_female", "2024") - res = link.max("salary", "2024", condition=is_female) - # hh 0: max(1000, 300) = 1000 (only females) - assert res[0] == pytest.approx(1000.0) - assert res[1] == pytest.approx(0.0) # no females in hh 1 - - def test_avg_with_condition(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - is_female = rich_sim.calculate("is_female", "2024") - res = link.avg("salary", "2024", condition=is_female) - # hh 0: (1000 + 300) / 2 = 650 - assert res[0] == pytest.approx(650.0) - # hh 1: no females → 0 - assert res[1] == pytest.approx(0.0) - - -# ────────────────────────────────────────────────────────────── -# 4. 
Role-only filters (no condition) -# ────────────────────────────────────────────────────────────── - - -class TestRoleOnlyFilters: - """Test role filter without condition.""" - - def test_sum_parents_only(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - res = link.sum("salary", "2024", role=0) - # hh 0: person 0 (role 0, 1000) → 1000 - # hh 1: person 3 (role 0, 5000) → 5000 - assert res[0] == pytest.approx(1000.0) - assert res[1] == pytest.approx(5000.0) - - def test_sum_children_only(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - res = link.sum("salary", "2024", role=1) - # hh 0: person 1 (-200) + person 2 (300) = 100 - # hh 1: none → 0 - assert res[0] == pytest.approx(100.0) - assert res[1] == pytest.approx(0.0) - - def test_count_nonexistent_role(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - # Role 99 doesn't exist — count should be 0 everywhere - res = link.count("2024", role=99) - numpy.testing.assert_array_equal(res, [0, 0, 0, 0, 0, 0]) - - def test_any_parents_female(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - # Any parent female? - # hh 0: person 0 (parent, F) → True - # hh 1: person 3 (parent, M) → False - res = link.any("is_female", "2024", role=0) - assert res[0] is numpy.bool_(True) - assert res[1] is numpy.bool_(False) - - def test_min_max_single_member_role(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - # For a single-member group with the matching role, min == max == value - res_min = link.min("salary", "2024", role=0) - res_max = link.max("salary", "2024", role=0) - # hh 1: person 3 only parent → min = max = 5000 - assert res_min[1] == pytest.approx(5000.0) - assert res_max[1] == pytest.approx(5000.0) - - -# ────────────────────────────────────────────────────────────── -# 5. 
Cross-check: sum == count * avg (when count > 0) -# ────────────────────────────────────────────────────────────── - - -class TestCrossChecks: - """Verify mathematical relationships between aggregation methods.""" - - def test_sum_equals_count_times_avg(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - s = link.sum("salary", "2024") - c = link.count("2024") - a = link.avg("salary", "2024") - - for i in range(len(s)): - if c[i] > 0: - assert s[i] == pytest.approx(c[i] * a[i]) - - def test_min_le_avg_le_max(self, rich_sim): - link = rich_sim.populations["household"].links["members"] - mn = link.min("salary", "2024") - a = link.avg("salary", "2024") - mx = link.max("salary", "2024") - c = link.count("2024") - - for i in range(len(mn)): - if c[i] > 0: - assert mn[i] <= a[i] + 1e-10 - assert a[i] <= mx[i] + 1e-10 - - def test_sum_with_role_and_condition_equals_filtered_sum(self, rich_sim): - """Sum with filters == sum of manually filtered values.""" - link = rich_sim.populations["household"].links["members"] - is_female = rich_sim.calculate("is_female", "2024") - - # Via link - link_sum = link.sum("salary", "2024", role=1, condition=is_female) - - # Manual computation - salaries = rich_sim.calculate("salary", "2024") - hh_ids = rich_sim.calculate("household_id", "2024") - roles = rich_sim.calculate("household_role", "eternity") - n_hh = rich_sim.populations["household"].count - - manual_sum = numpy.zeros(n_hh) - for p in range(len(salaries)): - hh = hh_ids[p] - if hh >= 0 and hh < n_hh and roles[p] == 1 and is_female[p]: - manual_sum[hh] += salaries[p] - - numpy.testing.assert_array_almost_equal(link_sum[:n_hh], manual_sum) diff --git a/openfisca_core/populations/_core_population.py b/openfisca_core/populations/_core_population.py index 349964900..721e324f6 100644 --- a/openfisca_core/populations/_core_population.py +++ b/openfisca_core/populations/_core_population.py @@ -42,12 +42,6 @@ class CorePopulation: #: The 
:class:`~simulations.Simulation` for which the population is calculated. simulation: None | t.Simulation = None - # === Dynamic population helpers (optional) === - _dynamic: bool = False - _permanent_ids: None | numpy.ndarray = None - _id_to_rownum: None | numpy.ndarray = None - _next_id: None | int = None - _period_index: None | dict = None def __init__(self, entity: t.CoreEntity, *__args: object, **__kwds: object) -> None: self.entity = entity @@ -457,31 +451,5 @@ def get_memory_usage( by_variable=holders_memory_usage, ) - # -- Dynamic population period index helpers ------------------------- - - def snapshot_period(self, period: t.PeriodLike) -> None: - """Save the current ``_id_to_rownum`` mapping for ``period``. - - If no ``_id_to_rownum`` exists yet, a default identity mapping is - created (positions == ids) sized to ``self.count``. - """ - if self._period_index is None: - self._period_index = {} - - if self._id_to_rownum is None: - # Default identity mapping: id -> row (0..count-1) - self._id_to_rownum = numpy.arange(self.count, dtype=numpy.intp) - - self._period_index[periods.period(period)] = self._id_to_rownum.copy() - - def get_period_id_to_rownum(self, period: t.PeriodLike) -> None | numpy.ndarray: - """Return the saved ``id -> rownum`` mapping for ``period`` or ``None``. - - Returns ``None`` when no index was saved for that period. - """ - if self._period_index is None: - return None - return self._period_index.get(periods.period(period)) - __all__ = ["CorePopulation"] diff --git a/openfisca_core/populations/group_population.py b/openfisca_core/populations/group_population.py index a3be9539a..1fa5f32f4 100644 --- a/openfisca_core/populations/group_population.py +++ b/openfisca_core/populations/group_population.py @@ -96,42 +96,16 @@ def get_role(self, role_name): None, ) - # Filtering helpers - - def _build_member_mask(self, role=None, condition=None): - """Build a combined boolean mask from role and condition filters. 
- - Returns None if no filtering is needed (both are None), otherwise - returns a boolean array of length ``members.count``. - """ - if role is None and condition is None: - return None - - mask = numpy.ones(self.members.count, dtype=bool) - - if role is not None: - role_filter = self.members.has_role(role) - mask &= role_filter - - if condition is not None: - mask &= condition - - return mask - # Aggregation persons -> entity @projectors.projectable - def sum(self, array, role=None, condition=None): + def sum(self, array, role=None): """Return the sum of ``array`` for the members of the entity. ``array`` must have the dimension of the number of persons in the simulation If ``role`` is provided, only the entity member with the given role are taken into account. - If ``condition`` is provided (a boolean array of the same size as the - person population), only members for whom the condition is ``True`` are - taken into account. ``role`` and ``condition`` can be combined. - Example: >>> salaries = household.members( ... "salary", "2018-01" @@ -142,27 +116,23 @@ def sum(self, array, role=None, condition=None): """ self.entity.check_role_validity(role) self.members.check_array_compatible_with_entity(array) - mask = self._build_member_mask(role, condition) - if mask is not None: + if role is not None: + role_filter = self.members.has_role(role) return numpy.bincount( - self.members_entity_id[mask], - weights=array[mask], + self.members_entity_id[role_filter], + weights=array[role_filter], minlength=self.count, ) return numpy.bincount(self.members_entity_id, weights=array) @projectors.projectable - def any(self, array, role=None, condition=None): + def any(self, array, role=None): """Return ``True`` if ``array`` is ``True`` for any members of the entity. ``array`` must have the dimension of the number of persons in the simulation If ``role`` is provided, only the entity member with the given role are taken into account. 
- If ``condition`` is provided (a boolean array of the same size as the - person population), only members for whom the condition is ``True`` are - taken into account. ``role`` and ``condition`` can be combined. - Example: >>> salaries = household.members( ... "salary", "2018-01" @@ -171,18 +141,16 @@ def any(self, array, role=None, condition=None): >>> array([True]) """ - sum_in_entity = self.sum(array, role=role, condition=condition) + sum_in_entity = self.sum(array, role=role) return sum_in_entity > 0 @projectors.projectable - def reduce(self, array, reducer, neutral_element, role=None, condition=None): + def reduce(self, array, reducer, neutral_element, role=None): self.members.check_array_compatible_with_entity(array) self.entity.check_role_validity(role) position_in_entity = self.members_position - mask = self._build_member_mask(role, condition) - if mask is None: - mask = True # scalar True broadcasts; preserves old upcast behavior - filtered_array = numpy.where(mask, array, neutral_element) + role_filter = self.members.has_role(role) if role is not None else True + filtered_array = numpy.where(role_filter, array, neutral_element) result = self.filled_array( neutral_element, @@ -199,17 +167,13 @@ def reduce(self, array, reducer, neutral_element, role=None, condition=None): return result @projectors.projectable - def all(self, array, role=None, condition=None): + def all(self, array, role=None): """Return ``True`` if ``array`` is ``True`` for all members of the entity. ``array`` must have the dimension of the number of persons in the simulation If ``role`` is provided, only the entity member with the given role are taken into account. - If ``condition`` is provided (a boolean array of the same size as the - person population), only members for whom the condition is ``True`` are - taken into account. ``role`` and ``condition`` can be combined. - Example: >>> salaries = household.members( ... 
"salary", "2018-01" @@ -223,21 +187,16 @@ def all(self, array, role=None, condition=None): reducer=numpy.logical_and, neutral_element=True, role=role, - condition=condition, ) @projectors.projectable - def max(self, array, role=None, condition=None): + def max(self, array, role=None): """Return the maximum value of ``array`` for the entity members. ``array`` must have the dimension of the number of persons in the simulation If ``role`` is provided, only the entity member with the given role are taken into account. - If ``condition`` is provided (a boolean array of the same size as the - person population), only members for whom the condition is ``True`` are - taken into account. ``role`` and ``condition`` can be combined. - Example: >>> salaries = household.members( ... "salary", "2018-01" @@ -251,21 +210,16 @@ def max(self, array, role=None, condition=None): reducer=numpy.maximum, neutral_element=-numpy.inf, role=role, - condition=condition, ) @projectors.projectable - def min(self, array, role=None, condition=None): + def min(self, array, role=None): """Return the minimum value of ``array`` for the entity members. ``array`` must have the dimension of the number of persons in the simulation If ``role`` is provided, only the entity member with the given role are taken into account. - If ``condition`` is provided (a boolean array of the same size as the - person population), only members for whom the condition is ``True`` are - taken into account. ``role`` and ``condition`` can be combined. - Example: >>> salaries = household.members( ... "salary", "2018-01" @@ -283,18 +237,13 @@ def min(self, array, role=None, condition=None): reducer=numpy.minimum, neutral_element=numpy.inf, role=role, - condition=condition, ) @projectors.projectable - def nb_persons(self, role=None, condition=None): + def nb_persons(self, role=None): """Returns the number of persons contained in the entity. 
If ``role`` is provided, only the entity member with the given role are taken into account. - - If ``condition`` is provided (a boolean array of the same size as the - person population), only members for whom the condition is ``True`` are - counted. ``role`` and ``condition`` can be combined. """ if role: if role.subroles: @@ -303,9 +252,7 @@ def nb_persons(self, role=None, condition=None): ) else: role_condition = self.members_role == role - return self.sum(role_condition, condition=condition) - if condition is not None: - return self.sum(condition.astype(float)).astype(int) + return self.sum(role_condition) return numpy.bincount(self.members_entity_id) # Projection person -> entity diff --git a/openfisca_core/populations/population.py b/openfisca_core/populations/population.py index 9347fbd68..24742ab0a 100644 --- a/openfisca_core/populations/population.py +++ b/openfisca_core/populations/population.py @@ -24,9 +24,6 @@ def clone(self, simulation: t.Simulation) -> t.CorePopulation: return result def __getattr__(self, attribute: str) -> projectors.Projector: - if hasattr(self, "links") and attribute in self.links: - return self.links[attribute] - projector: projectors.Projector | None projector = projectors.get_projector_from_shortcut(self, attribute) @@ -114,10 +111,11 @@ def get_rank( """ # If entity is for instance 'person.household', we get the reference entity 'household' behind the projector - if isinstance(entity, projectors.Projector): - entity = entity.reference_entity - elif hasattr(entity, "_target_population"): # Handle new Link system - entity = entity._target_population + entity = ( + entity + if not isinstance(entity, projectors.Projector) + else entity.reference_entity + ) positions = entity.members_position biggest_entity_size = numpy.max(positions) + 1 diff --git a/openfisca_core/simulations/_build_default_simulation.py b/openfisca_core/simulations/_build_default_simulation.py index ac03ba1b5..adc7cf478 100644 --- 
a/openfisca_core/simulations/_build_default_simulation.py +++ b/openfisca_core/simulations/_build_default_simulation.py @@ -30,7 +30,6 @@ class _BuildDefaultSimulation: ... .add_count() ... .add_ids() ... .add_members_entity_id() - ... .add_id_to_rownum() ... ) >>> builder.count @@ -122,16 +121,6 @@ def add_ids(self) -> Self: return self - def add_id_to_rownum(self) -> Self: - """Set identity id_to_rownum mapping on all populations. - - For static simulations, each entity's permanent ID equals its row - position, so id_to_rownum is the identity: id_to_rownum[i] = i. - """ - for population in self.populations.values(): - population._id_to_rownum = numpy.arange(self.count, dtype=numpy.intp) - return self - def add_members_entity_id(self) -> Self: """Add ??? diff --git a/openfisca_core/simulations/simulation.py b/openfisca_core/simulations/simulation.py index be5948919..fe17a31db 100644 --- a/openfisca_core/simulations/simulation.py +++ b/openfisca_core/simulations/simulation.py @@ -47,7 +47,6 @@ def __init__( self.persons = self.populations[tax_benefit_system.person_entity.key] self.link_to_entities_instances() self.create_shortcuts() - self._resolve_links() self.invalidated_caches = set() @@ -83,53 +82,6 @@ def create_shortcuts(self) -> None: # create shortcut simulation.person and simulation.household (for instance) setattr(self, population.entity.key, population) - def _resolve_links(self) -> None: - """Attach and resolve all links declared on every entity. - - This also auto-generates implicit links for each GroupEntity: - - A Many2One link on the person entity (e.g. ``person.household``) - - A One2Many link on the group entity (e.g. ``household.members``) - - This is called once at ``__init__`` time, after - ``link_to_entities_instances`` and ``create_shortcuts``. 
- """ - from openfisca_core.links.implicit import ( - ImplicitMany2OneLink, - ImplicitOne2ManyLink, - ) - from openfisca_core.populations.group_population import GroupPopulation - - person_entity = self.persons.entity - - # Auto-generate implicit links - for population in self.populations.values(): - if not isinstance(population, GroupPopulation): - continue - group_key = population.entity.key - - # person -> group (Many2One) - if not person_entity.get_link(group_key): - m2o = ImplicitMany2OneLink(group_key) - person_entity.add_link(m2o) - - # group -> persons (One2Many) - o2m_name = person_entity.plural - if not population.entity.get_link(o2m_name): - o2m = ImplicitOne2ManyLink(o2m_name, group_key) - population.entity.add_link(o2m) - - from copy import copy - - # Attach and resolve all links, making a simulation-local copy - for _key, population in self.populations.items(): - entity = population.entity - population.links = {} - for name, link in entity.links.items(): - bound_link = copy(link) - bound_link.attach(population) - bound_link.resolve(self.populations) - population.links[name] = bound_link - @property def data_storage_dir(self): """Temporary folder used to store intermediate calculation data in case the memory is saturated.""" @@ -185,10 +137,6 @@ def _calculate(self, variable_name: str, period: periods.Period): self._check_period_consistency(period, variable) - # Transition formula path: sparse, as_of-only, needs separate cache tracking. - if variable.has_transition_formula: - return self._calculate_transition(variable, population, holder, period) - # First look for a value already cached cached_array = holder.get_array(period) if cached_array is not None: @@ -384,103 +332,6 @@ def trace_parameters_at_instant(self, formula_period): self.tracer, ) - def _calculate_transition(self, variable, population, holder, period): - """Calculation path for as_of variables with transition_formula. 
- - Runs the formula at most once per instant, stores the sparse patch via - _set_as_of_sparse, then returns the reconstructed state. - """ - instant = period.start if variable.as_of == "start" else period.stop - - # Already computed for this instant → return current state. - if instant in holder._as_of_transition_computed: - result = holder.get_array(period) - return result if result is not None else holder.default_array() - - # start_computation_period guard. - if self.start_computation_period is not None: - if not isinstance(self.start_computation_period, periods.Period): - self.start_computation_period = periods.period( - self.start_computation_period - ) - if period < self.start_computation_period: - holder._as_of_transition_computed.add(instant) - result = holder.get_array(period) - return result if result is not None else holder.default_array() - - # No base established yet: run initial_formula if defined, else raise. - if holder._as_of_base is None: - initial_formula = variable.get_initial_formula(period) - if initial_formula is not None: - self.tracer.record_formula_type("initial") - array = self._run_initial_formula(initial_formula, population, period) - array = self._cast_formula_result(array, variable) - holder.set_input(period, array) - holder._as_of_transition_computed.add(instant) - return array - raise ValueError( - f'Variable "{variable.name}" has no initial state for period {period}. ' - f"Either call set_input() before the simulation starts, " - f"or define an initial_formula on the variable." - ) - - formula = variable.get_transition_formula(period) - if formula is not None: - try: - # Use strict cycle check only: reading the same variable at a - # different period (e.g. period.last_month) is legitimate for - # as_of variables — termination is guaranteed by - # _as_of_transition_computed. SpiralError is NOT raised here. 
- self._check_for_strict_cycle(variable.name, period) - self.tracer.record_formula_type("transition") - result = self._run_transition_formula(formula, population, period) - - if result is not None: - selector, vals = result - if hasattr(selector, "dtype") and selector.dtype == numpy.bool_: - idx = numpy.where(selector)[0].astype(numpy.int32) - else: - idx = numpy.asarray(selector, dtype=numpy.int32) - if numpy.isscalar(vals): - vals = numpy.full(len(idx), vals, dtype=variable.dtype) - else: - vals = numpy.asarray(vals, dtype=variable.dtype) - if len(vals) != len(idx): - raise ValueError( - f'transition_formula of "{variable.name}" returned ' - f"{len(vals)} values for {len(idx)} selected indices." - ) - holder._set_as_of_sparse(period, idx, vals) - - except errors.CycleError: - pass # no patch stored, previous state persists - - holder._as_of_transition_computed.add(instant) - result = holder.get_array(period) - return result if result is not None else holder.default_array() - - def _run_transition_formula(self, formula, population, period): - """Call a transition_formula and return its (selector, vals) result.""" - if self.trace: - parameters_at = self.trace_parameters_at_instant - else: - parameters_at = self.tax_benefit_system.get_parameters_at_instant - - if formula.__code__.co_argcount == 2: - return formula(population, period) - return formula(population, period, parameters_at) - - def _run_initial_formula(self, formula, population, period): - """Call an initial_formula and return a full dense array.""" - if self.trace: - parameters_at = self.trace_parameters_at_instant - else: - parameters_at = self.tax_benefit_system.get_parameters_at_instant - - if formula.__code__.co_argcount == 2: - return formula(population, period) - return formula(population, period, parameters_at) - def _run_formula(self, variable, population, period): """Find the ``variable`` formula for the given ``period`` if it exists, and apply it to ``population``.""" formula = 
variable.get_formula(period) @@ -578,19 +429,6 @@ def _check_for_cycle(self, variable: str, period) -> None: message = f"Quasicircular definition detected on formula {variable}@{period} involving {self.tracer.stack}" raise errors.SpiralError(message, variable) - def _check_for_strict_cycle(self, variable: str, period) -> None: - """Raise CycleError if variable@period is already on the stack. - - Unlike _check_for_cycle, this does NOT raise SpiralError for the same - variable at different periods. Used by transition_formula where reading - the previous period's value is legitimate and termination is guaranteed - by _as_of_transition_computed. - """ - for frame in self.tracer.stack[:-1]: - if frame["name"] == variable and frame["period"] == period: - msg = f"Circular definition detected on transition_formula {variable}@{period}" - raise errors.CycleError(msg) - def invalidate_cache_entry(self, variable: str, period) -> None: self.invalidated_caches.add(Cache(variable, period)) diff --git a/openfisca_core/simulations/simulation_builder.py b/openfisca_core/simulations/simulation_builder.py index 636ad7fd4..7464b4650 100644 --- a/openfisca_core/simulations/simulation_builder.py +++ b/openfisca_core/simulations/simulation_builder.py @@ -327,7 +327,6 @@ def build_default_simulation( .add_count() .add_ids() .add_members_entity_id() - .add_id_to_rownum() .simulation ) @@ -660,7 +659,6 @@ def finalize_variables_init(self, population) -> None: if plural_key in self.entity_counts: population.count = self.get_count(plural_key) population.ids = self.get_ids(plural_key) - population._id_to_rownum = numpy.arange(population.count, dtype=numpy.intp) if plural_key in self.memberships: population.members_entity_id = numpy.array(self.get_memberships(plural_key)) population.members_role = numpy.array(self.get_roles(plural_key)) diff --git a/openfisca_core/taxscales/abstract_rate_tax_scale.py b/openfisca_core/taxscales/abstract_rate_tax_scale.py index 2d2731e6e..9d828ed67 100644 --- 
a/openfisca_core/taxscales/abstract_rate_tax_scale.py +++ b/openfisca_core/taxscales/abstract_rate_tax_scale.py @@ -36,7 +36,7 @@ def calc( tax_base: NumericalArray, right: bool, ) -> typing.NoReturn: - msg = f"Method 'calc' is not implemented for {self.__class__.__name__}" + msg = "Method 'calc' is not implemented for " f"{self.__class__.__name__}" raise NotImplementedError( msg, ) diff --git a/openfisca_core/taxscales/abstract_tax_scale.py b/openfisca_core/taxscales/abstract_tax_scale.py index af42ba488..de9a6348c 100644 --- a/openfisca_core/taxscales/abstract_tax_scale.py +++ b/openfisca_core/taxscales/abstract_tax_scale.py @@ -32,7 +32,7 @@ def __init__( super().__init__(name, option, unit) def __repr__(self) -> typing.NoReturn: - msg = f"Method '__repr__' is not implemented for {self.__class__.__name__}" + msg = "Method '__repr__' is not implemented for " f"{self.__class__.__name__}" raise NotImplementedError( msg, ) @@ -42,7 +42,7 @@ def calc( tax_base: NumericalArray, right: bool, ) -> typing.NoReturn: - msg = f"Method 'calc' is not implemented for {self.__class__.__name__}" + msg = "Method 'calc' is not implemented for " f"{self.__class__.__name__}" raise NotImplementedError( msg, ) diff --git a/openfisca_core/taxscales/tax_scale_like.py b/openfisca_core/taxscales/tax_scale_like.py index 691ba0cf3..e8680b9f8 100644 --- a/openfisca_core/taxscales/tax_scale_like.py +++ b/openfisca_core/taxscales/tax_scale_like.py @@ -36,13 +36,13 @@ def __init__( self.thresholds = [] def __eq__(self, _other: object) -> typing.NoReturn: - msg = f"Method '__eq__' is not implemented for {self.__class__.__name__}" + msg = "Method '__eq__' is not implemented for " f"{self.__class__.__name__}" raise NotImplementedError( msg, ) def __ne__(self, _other: object) -> typing.NoReturn: - msg = f"Method '__ne__' is not implemented for {self.__class__.__name__}" + msg = "Method '__ne__' is not implemented for " f"{self.__class__.__name__}" raise NotImplementedError( msg, ) diff --git 
a/openfisca_core/tracers/computation_log.py b/openfisca_core/tracers/computation_log.py index ea46befa8..96765935a 100644 --- a/openfisca_core/tracers/computation_log.py +++ b/openfisca_core/tracers/computation_log.py @@ -20,7 +20,6 @@ def lines( max_depth: int = sys.maxsize, ignore_default: bool = False, tax_benefit_system: t.TaxBenefitSystem | None = None, - show_formula_type: bool = False, ) -> list[str]: """Generate lines for the computation log. @@ -38,13 +37,7 @@ def lines( lines_by_tree = [ self._get_node_log( - node, - depth, - aggregate, - max_depth, - ignore_default, - tax_benefit_system, - show_formula_type, + node, depth, aggregate, max_depth, ignore_default, tax_benefit_system ) for node in self._full_tracer.trees ] @@ -57,7 +50,6 @@ def print_log( max_depth: int = sys.maxsize, ignore_default: bool = False, tax_benefit_system: t.TaxBenefitSystem | None = None, - show_formula_type: bool = False, ) -> None: """Print the computation log of a simulation. @@ -80,7 +72,7 @@ def print_log( its children are also hidden, even if they have non-default values. 
""" for line in self.lines( - aggregate, max_depth, ignore_default, tax_benefit_system, show_formula_type + aggregate, max_depth, ignore_default, tax_benefit_system ): print(line) # noqa: T201 @@ -92,7 +84,6 @@ def _get_node_log( max_depth: int = sys.maxsize, ignore_default: bool = False, tax_benefit_system: t.TaxBenefitSystem | None = None, - show_formula_type: bool = False, ) -> list[str]: if depth > max_depth: return [] @@ -102,7 +93,7 @@ def _get_node_log( # Don't display this node or its children return [] - node_log = [self._print_line(depth, node, aggregate, show_formula_type)] + node_log = [self._print_line(depth, node, aggregate)] children_logs = [ self._get_node_log( @@ -112,7 +103,6 @@ def _get_node_log( max_depth, ignore_default, tax_benefit_system, - show_formula_type, ) for child in node.children ] @@ -167,13 +157,7 @@ def _is_default_value( # If we can't determine, assume not default to be safe return False - def _print_line( - self, - depth: int, - node: t.TraceNode, - aggregate: bool, - show_formula_type: bool = False, - ) -> str: + def _print_line(self, depth: int, node: t.TraceNode, aggregate: bool) -> str: indent = " " * depth value = node.value @@ -198,12 +182,7 @@ def _print_line( else: formatted_value = self.display(value) - type_tag = ( - f" [{node.formula_type}]" - if show_formula_type and node.formula_type is not None - else "" - ) - return f"{indent}{node.name}<{node.period}>{type_tag} >> {formatted_value}" + return f"{indent}{node.name}<{node.period}> >> {formatted_value}" @staticmethod def display(value: t.VarArray, max_depth: int = sys.maxsize) -> str: diff --git a/openfisca_core/tracers/full_tracer.py b/openfisca_core/tracers/full_tracer.py index 7aeda02d9..7a49aa487 100644 --- a/openfisca_core/tracers/full_tracer.py +++ b/openfisca_core/tracers/full_tracer.py @@ -69,10 +69,6 @@ def record_parameter_access( TraceNode(name=parameter, period=period, value=value), ) - def record_formula_type(self, formula_type: str) -> None: - if 
self._current_node is not None: - self._current_node.formula_type = formula_type - def record_calculation_result(self, value: t.VarArray) -> None: if self._current_node is not None: self._current_node.value = value diff --git a/openfisca_core/tracers/simple_tracer.py b/openfisca_core/tracers/simple_tracer.py index 1dbde407e..d096b0311 100644 --- a/openfisca_core/tracers/simple_tracer.py +++ b/openfisca_core/tracers/simple_tracer.py @@ -37,9 +37,6 @@ def record_calculation_start( """ self.stack.append({"name": variable, "period": period}) - def record_formula_type(self, formula_type: str) -> None: - """Ignore formula type (no-op for SimpleTracer).""" - def record_calculation_result(self, value: t.ArrayLike[object]) -> None: """Ignore calculation result.""" diff --git a/openfisca_core/tracers/trace_node.py b/openfisca_core/tracers/trace_node.py index 522256a14..8e8ce1fdc 100644 --- a/openfisca_core/tracers/trace_node.py +++ b/openfisca_core/tracers/trace_node.py @@ -27,11 +27,6 @@ class TraceNode: #: The value of the node. value: None | t.VarArray = None - #: The type of formula that produced this node. - #: None for regular formulas, "transition" for transition_formula, - #: "initial" for initial_formula. - formula_type: str | None = None - #: The start time of the node. start: t.Time = 0.0 diff --git a/openfisca_core/types.py b/openfisca_core/types.py index 771cfdbdf..3a4c0a468 100644 --- a/openfisca_core/types.py +++ b/openfisca_core/types.py @@ -475,9 +475,6 @@ def children(self, /) -> list[TraceNode]: ... @property def end(self, /) -> Time: ... - @property - def formula_type(self, /) -> str | None: ... - @property def name(self, /) -> str: ... 
diff --git a/openfisca_core/variables/config.py b/openfisca_core/variables/config.py index e83d41a36..54270145b 100644 --- a/openfisca_core/variables/config.py +++ b/openfisca_core/variables/config.py @@ -51,5 +51,3 @@ FORMULA_NAME_PREFIX = "formula" -INITIAL_FORMULA_NAME_PREFIX = "initial_formula" -TRANSITION_FORMULA_NAME_PREFIX = "transition_formula" diff --git a/openfisca_core/variables/variable.py b/openfisca_core/variables/variable.py index 4fd6984cd..b8026c22c 100644 --- a/openfisca_core/variables/variable.py +++ b/openfisca_core/variables/variable.py @@ -174,68 +174,12 @@ def __init__(self, baseline_variable=None) -> None: "introspection_data", ) - self.as_of = self.set( - attr, - "as_of", - setter=self.set_as_of, - ) - - self.snapshot_count = self.set( - attr, - "snapshot_count", - allowed_type=int, - ) - - if self.as_of and self.set_input: - msg = ( - f'Variable "{self.name}" declares both as_of and set_input, ' - f"which are incompatible. set_input helpers like " - f"set_input_divide_by_period scatter values across sub-periods " - f"and have no meaningful semantics for as_of variables." - ) - raise ValueError(msg) - - # Partition transition_formula* and initial_formula* before formula* so - # they don't land in unexpected_attrs. - transition_formulas_attr, attr = helpers._partition( - attr, - lambda name, value: name.startswith(config.TRANSITION_FORMULA_NAME_PREFIX), - ) - self.transition_formulas = self.set_transition_formulas( - transition_formulas_attr - ) - - if self.transition_formulas: - if not self.as_of: - raise ValueError( - f'Variable "{self.name}" declares transition_formula but not as_of. ' - f"transition_formula is reserved for as_of variables." 
- ) - - initial_formulas_attr, attr = helpers._partition( - attr, - lambda name, value: name.startswith(config.INITIAL_FORMULA_NAME_PREFIX), - ) - self.initial_formulas = self.set_initial_formulas(initial_formulas_attr) - - if self.initial_formulas and not self.as_of: - raise ValueError( - f'Variable "{self.name}" declares initial_formula but not as_of. ' - f"initial_formula is reserved for as_of variables." - ) - formulas_attr, unexpected_attrs = helpers._partition( attr, lambda name, value: name.startswith(config.FORMULA_NAME_PREFIX), ) self.formulas = self.set_formulas(formulas_attr) - if self.formulas and self.transition_formulas: - raise ValueError( - f'Variable "{self.name}" declares both formula and transition_formula. ' - f"They are mutually exclusive." - ) - if unexpected_attrs: msg = 'Unexpected attributes in definition of variable "{}": {!r}'.format( self.name, @@ -353,19 +297,6 @@ def set_documentation(self, documentation): return textwrap.dedent(documentation) return None - def set_as_of(self, value): - if value is None or value is False: - return False - if value is True or value == "start": - return "start" - if value == "end": - return "end" - msg = ( - f"Invalid value '{value}' for attribute 'as_of' in variable " - f"'{self.name}'. Allowed values are: True, 'start', 'end' (or False to disable)." 
- ) - raise ValueError(msg) - def set_set_input(self, set_input): if not set_input and self.baseline_variable: return self.baseline_variable.set_input @@ -403,148 +334,6 @@ def set_formulas(self, formulas_attr): return formulas - def set_transition_formulas(self, transition_formulas_attr): - transition_formulas = sortedcontainers.sorteddict.SortedDict() - for formula_name, formula in transition_formulas_attr.items(): - starting_date = self.parse_transition_formula_name(formula_name) - - if self.end is not None and starting_date > self.end: - msg = f'You declared that "{self.name}" ends on "{self.end}", but you wrote a transition_formula from "{starting_date}" ({formula_name}).' - raise ValueError(msg) - - transition_formulas[str(starting_date)] = formula - - if self.baseline_variable is not None and hasattr( - self.baseline_variable, "transition_formulas" - ): - first_reform_date = ( - transition_formulas.peekitem(0)[0] if transition_formulas else None - ) - transition_formulas.update( - { - baseline_date: baseline_formula - for baseline_date, baseline_formula in self.baseline_variable.transition_formulas.items() - if first_reform_date is None or baseline_date < first_reform_date - } - ) - - return transition_formulas - - def set_initial_formulas(self, initial_formulas_attr): - initial_formulas = sortedcontainers.sorteddict.SortedDict() - for formula_name, formula in initial_formulas_attr.items(): - starting_date = self.parse_initial_formula_name(formula_name) - initial_formulas[str(starting_date)] = formula - - if self.baseline_variable is not None and hasattr( - self.baseline_variable, "initial_formulas" - ): - first_reform_date = ( - initial_formulas.peekitem(0)[0] if initial_formulas else None - ) - initial_formulas.update( - { - baseline_date: baseline_formula - for baseline_date, baseline_formula in self.baseline_variable.initial_formulas.items() - if first_reform_date is None or baseline_date < first_reform_date - } - ) - - return initial_formulas - - def 
parse_initial_formula_name(self, attribute_name): - """Returns the starting date of an initial_formula based on its name. - - Valid formats: 'initial_formula', 'initial_formula_YYYY', - 'initial_formula_YYYY_MM', 'initial_formula_YYYY_MM_DD'. - """ - - def raise_error() -> NoReturn: - msg = ( - f'Unrecognized initial_formula name in variable "{self.name}". ' - f'Expecting "initial_formula_YYYY" or "initial_formula_YYYY_MM" ' - f'or "initial_formula_YYYY_MM_DD". Found: "{attribute_name}".' - ) - raise ValueError(msg) - - if attribute_name == config.INITIAL_FORMULA_NAME_PREFIX: - return datetime.date.min - - INITIAL_FORMULA_REGEX = r"initial_formula_(\d{4})(?:_(\d{2}))?(?:_(\d{2}))?$" - match = re.match(INITIAL_FORMULA_REGEX, attribute_name) - if not match: - raise_error() - date_str = "-".join( - [match.group(1), match.group(2) or "01", match.group(3) or "01"], - ) - try: - return datetime.date.fromisoformat(date_str) - except ValueError: - raise_error() - - def get_initial_formula(self, period=None): - """Returns the initial_formula applicable at the given period.""" - if not self.initial_formulas: - return None - - if period is None: - return self.initial_formulas.peekitem(index=0)[1] - - if isinstance(period, Period): - instant = period.start - else: - try: - instant = periods.period(period).start - except ValueError: - instant = periods.instant(period) - - if instant is None: - return None - - instant_str = str(instant) - for start_date in reversed(self.initial_formulas): - if start_date <= instant_str: - return self.initial_formulas[start_date] - - return None - - @property - def has_initial_formula(self) -> bool: - """True if the variable defines an initial_formula.""" - return bool(self.initial_formulas) - - def parse_transition_formula_name(self, attribute_name): - """Returns the starting date of a transition_formula based on its name. 
- - Valid formats: 'transition_formula', 'transition_formula_YYYY', - 'transition_formula_YYYY_MM', 'transition_formula_YYYY_MM_DD'. - """ - - def raise_error() -> NoReturn: - msg = ( - f'Unrecognized transition_formula name in variable "{self.name}". ' - f'Expecting "transition_formula_YYYY" or "transition_formula_YYYY_MM" ' - f'or "transition_formula_YYYY_MM_DD". Found: "{attribute_name}".' - ) - raise ValueError(msg) - - if attribute_name == config.TRANSITION_FORMULA_NAME_PREFIX: - return datetime.date.min - - TRANSITION_FORMULA_REGEX = ( - r"transition_formula_(\d{4})(?:_(\d{2}))?(?:_(\d{2}))?$" - ) - match = re.match(TRANSITION_FORMULA_REGEX, attribute_name) - if not match: - raise_error() - date_str = "-".join( - [match.group(1), match.group(2) or "01", match.group(3) or "01"], - ) - try: - return datetime.datetime.strptime(date_str, "%Y-%m-%d").date() - except ValueError: - raise_error() - def parse_formula_name(self, attribute_name): """Returns the starting date of a formula based on its name. 
@@ -583,7 +372,7 @@ def raise_error() -> NoReturn: def is_input_variable(self): """Returns True if the variable is an input variable.""" - return len(self.formulas) == 0 and len(self.transition_formulas) == 0 + return len(self.formulas) == 0 @classmethod def get_introspection_data(cls): @@ -642,44 +431,6 @@ def get_formula( return None - def get_transition_formula( - self, - period: None | t.Instant | t.Period | str | int = None, - ) -> None | t.Formula: - """Returns the transition_formula applicable at the given period.""" - if not self.transition_formulas: - return None - - if period is None: - return self.transition_formulas.peekitem(index=0)[1] - - if isinstance(period, Period): - instant = period.start - else: - try: - instant = periods.period(period).start - except ValueError: - instant = periods.instant(period) - - if instant is None: - return None - - if self.end and instant.date > self.end: - return None - - instant_str = str(instant) - - for start_date in reversed(self.transition_formulas): - if start_date <= instant_str: - return self.transition_formulas[start_date] - - return None - - @property - def has_transition_formula(self) -> bool: - """True if the variable uses transition_formula instead of formula.""" - return bool(self.transition_formulas) - def clone(self): return self.__class__() diff --git a/pyproject.toml b/pyproject.toml index b2b82088e..1add2309f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,4 +11,4 @@ ignore-words-list = [ "impot", "treshold", ] -skip = "./venv,./docs/implementation" +skip = "./venv" diff --git a/setup.py b/setup.py index eb0585e46..5d1716533 100644 --- a/setup.py +++ b/setup.py @@ -61,7 +61,6 @@ dev_requirements = [ "black >=24.8.0, <25.0", - "pytest-benchmark >=4.0.0, <5.0", "codespell >=2.3.0, <3.0", "colorama >=0.4.4, <0.5", "darglint >=1.8.1, <2.0", @@ -85,7 +84,7 @@ setup( name="OpenFisca-Core", - version="44.5.0", + version="44.6.0", author="OpenFisca Team", author_email="contact@openfisca.org", 
classifiers=[ diff --git a/tasks/lint.mk b/tasks/lint.mk index a7e7cda4f..532518dc7 100644 --- a/tasks/lint.mk +++ b/tasks/lint.mk @@ -1,5 +1,3 @@ -PYTHON ?= python - ## Lint the codebase. lint: check-syntax-errors check-style lint-doc @$(call print_pass,$@:) @@ -7,15 +5,15 @@ lint: check-syntax-errors check-style lint-doc ## Compile python files to check for syntax errors. check-syntax-errors: . @$(call print_help,$@:) - @$(PYTHON) -m compileall -q $? + @python -m compileall -q $? @$(call print_pass,$@:) ## Run linters to check for syntax and style errors. check-style: $(shell git ls-files "*.py" "*.pyi") @$(call print_help,$@:) - @$(PYTHON) -m isort --check $? - @$(PYTHON) -m black --check $? - @$(PYTHON) -m flake8 $? + @python -m isort --check $? + @python -m black --check $? + @python -m flake8 $? @codespell @$(call print_pass,$@:) @@ -37,14 +35,14 @@ lint-doc-%: @## able to integrate documentation improvements progressively. @## @$(call print_help,$(subst $*,%,$@:)) - @$(PYTHON) -m flake8 --select=D101,D102,D103,DAR openfisca_core/$* - @$(PYTHON) -m pylint openfisca_core/$* + @python -m flake8 --select=D101,D102,D103,DAR openfisca_core/$* + @python -m pylint openfisca_core/$* @$(call print_pass,$@:) ## Run static type checkers for type errors. check-types: @$(call print_help,$@:) - @$(PYTHON) -m mypy \ + @python -m mypy \ openfisca_core/commons \ openfisca_core/data_storage \ openfisca_core/experimental \ @@ -57,7 +55,7 @@ check-types: ## Run code formatters to correct style errors. format-style: $(shell git ls-files "*.py" "*.pyi") @$(call print_help,$@:) - @$(PYTHON) -m isort $? - @$(PYTHON) -m black $? + @python -m isort $? + @python -m black $? 
@codespell --write-changes @$(call print_pass,$@:) diff --git a/tests/core/parameters_date_indexing/test_date_indexing.py b/tests/core/parameters_date_indexing/test_date_indexing.py index e6c7303c7..742c426a7 100644 --- a/tests/core/parameters_date_indexing/test_date_indexing.py +++ b/tests/core/parameters_date_indexing/test_date_indexing.py @@ -5,6 +5,8 @@ from openfisca_core.parameters import ParameterNode from openfisca_core.tools import assert_near +from openfisca_core.model_api import * # noqa + LOCAL_DIR = os.path.dirname(os.path.abspath(__file__)) parameters = ParameterNode(directory_path=LOCAL_DIR) diff --git a/tests/core/test_asof_variable.py b/tests/core/test_asof_variable.py deleted file mode 100644 index 875695b22..000000000 --- a/tests/core/test_asof_variable.py +++ /dev/null @@ -1,563 +0,0 @@ -"""Tests for the as_of variable feature. - -An as_of variable's value, once set at a given instant, persists forward in -time until explicitly overridden. Values are stored as a base array + -sparse patches (changed indices/values only); the snapshot cursor makes -forward-sequential reads incremental. - -Formulas and aggregations are completely unaware of the mechanism. 
-""" - -from __future__ import annotations - -import numpy -import pytest - -from openfisca_core.entities import Entity -from openfisca_core.holders import Holder -from openfisca_core.periods import DateUnit, period -from openfisca_core.populations import Population -from openfisca_core.variables import Variable - -# --------------------------------------------------------------------------- -# Minimal test fixtures – no country-template dependency -# --------------------------------------------------------------------------- - -_entity = Entity("person", "persons", "", "") - - -class _AsOfIntVariable(Variable): - """A simple integer variable that persists (as_of = 'start').""" - - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - - -class _AsOfEndVariable(Variable): - """Same but uses the end-of-period convention.""" - - entity = _entity - definition_period = DateUnit.YEAR - value_type = int - as_of = "end" - - -class _RegularVariable(Variable): - """A normal variable without as_of semantics.""" - - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - - -def _make_holder(variable_class, count=2): - """Return a ready-to-use Holder with *count* individuals.""" - population = Population(_entity) - population.simulation = None - population.count = count - return Holder(variable_class(), population) - - -def _make_holder_with_memory_config(variable_class, asof_max_snapshots, count=2): - """Return a Holder whose simulation carries a MemoryConfig stub.""" - var = variable_class() - - class _StubMemoryConfig: - # Put the variable in priority_variables to skip disk-storage creation. 
- priority_variables: frozenset = frozenset({var.name}) - variables_to_drop: frozenset = frozenset() - asof_max_snapshots = None # overridden below - - _StubMemoryConfig.asof_max_snapshots = asof_max_snapshots - - class _StubSimulation: - memory_config = _StubMemoryConfig() - - population = Population(_entity) - population.simulation = _StubSimulation() - population.count = count - return Holder(var, population) - - -# --------------------------------------------------------------------------- -# 1. Value persists forward in time -# --------------------------------------------------------------------------- - - -def test_asof_persists_forward(): - """Value set in Jan 2024 should be returned for Feb and Mar 2024.""" - holder = _make_holder(_AsOfIntVariable) - holder.set_input("2024-01", numpy.array([10, 20])) - - for month in ("2024-02", "2024-03", "2024-06", "2024-12"): - result = holder.get_array(period(month)) - numpy.testing.assert_array_equal( - result, - [10, 20], - err_msg=f"Expected persisted value for {month}", - ) - - -# --------------------------------------------------------------------------- -# 2. No value stored before the first stored instant → None -# --------------------------------------------------------------------------- - - -def test_asof_no_value_before_first_stored(): - """get_array returns None for any period before the first stored instant.""" - holder = _make_holder(_AsOfIntVariable) - holder.set_input("2024-06", numpy.array([1, 2])) - - assert holder.get_array(period("2024-01")) is None - assert holder.get_array(period("2024-05")) is None - - -# --------------------------------------------------------------------------- -# 3. 
Exact match returns the correct value (via snapshot cursor) -# --------------------------------------------------------------------------- - - -def test_asof_exact_match_returns_stored_value(): - """get_array for the exact base period returns the base value.""" - holder = _make_holder(_AsOfIntVariable) - holder.set_input("2024-03", numpy.array([7, 8])) - - result = holder.get_array(period("2024-03")) - numpy.testing.assert_array_equal(result, [7, 8]) - - -# --------------------------------------------------------------------------- -# 4. Most-recent stored value wins -# --------------------------------------------------------------------------- - - -def test_asof_takes_most_recent_value(): - """With two stored values, the one closest to (but not after) target wins.""" - holder = _make_holder(_AsOfIntVariable) - holder.set_input("2024-01", numpy.array([1, 1])) - holder.set_input("2024-04", numpy.array([4, 4])) - - # Before first stored instant → None - assert holder.get_array(period("2023-12")) is None - - # At or after first, before second → first value - for month in ("2024-01", "2024-02", "2024-03"): - numpy.testing.assert_array_equal( - holder.get_array(period(month)), - [1, 1], - err_msg=f"Expected first value for {month}", - ) - - # At or after second → second value - for month in ("2024-04", "2024-05", "2025-01"): - numpy.testing.assert_array_equal( - holder.get_array(period(month)), - [4, 4], - err_msg=f"Expected second value for {month}", - ) - - -# --------------------------------------------------------------------------- -# 5. 
Convention: "start" vs "end" for a YEAR period -# --------------------------------------------------------------------------- - - -def test_asof_convention_start(): - """With as_of='start', a value set mid-year is NOT visible for that year.""" - holder = _make_holder(_AsOfIntVariable) # definition_period=MONTH, as_of='start' - holder.set_input("2024-06", numpy.array([99, 99])) - - # Period "2024-01" starts at 2024-01-01 < 2024-06-01 → None - assert holder.get_array(period("2024-01")) is None - # Period "2024-07" starts at 2024-07-01 > 2024-06-01 → visible - numpy.testing.assert_array_equal( - holder.get_array(period("2024-07")), - [99, 99], - ) - - -def test_asof_convention_end(): - """With as_of='end', a value set mid-year IS visible for that year.""" - holder = _make_holder(_AsOfEndVariable) # definition_period=YEAR, as_of='end' - holder.set_input("2024", numpy.array([42, 42])) - - # Year 2024 ends 2024-12-31; our value is stored with start 2024-01-01 - # which is ≤ 2024-12-31 → visible - numpy.testing.assert_array_equal( - holder.get_array(period("2024")), - [42, 42], - ) - - -# --------------------------------------------------------------------------- -# 6. Regular variable (no as_of) is unaffected -# --------------------------------------------------------------------------- - - -def test_non_asof_variable_unaffected(): - """A variable without as_of still returns None for unstored periods.""" - holder = _make_holder(_RegularVariable) - holder.set_input("2024-01", numpy.array([5, 6])) - - # Exact match works - numpy.testing.assert_array_equal(holder.get_array(period("2024-01")), [5, 6]) - # Other periods return None (no persistence) - assert holder.get_array(period("2024-02")) is None - assert holder.get_array(period("2023-12")) is None - - -# --------------------------------------------------------------------------- -# 7. 
Patch storage: only the diff is persisted -# --------------------------------------------------------------------------- - - -def test_asof_no_patch_when_value_unchanged(): - """When the new value is identical to the current state, no patch is stored.""" - holder = _make_holder(_AsOfIntVariable) - holder.set_input("2024-01", numpy.array([3, 3])) # base - holder.set_input("2024-02", numpy.array([3, 3])) # identical → no patch - - assert ( - len(holder._as_of_patches) == 0 - ), "No patch should be stored for unchanged values" - - -def test_asof_patch_stores_only_changed_indices(): - """A patch stores only the indices and values that actually changed.""" - holder = _make_holder(_AsOfIntVariable, count=3) - holder.set_input("2024-01", numpy.array([1, 2, 3])) # base - holder.set_input("2024-04", numpy.array([1, 9, 3])) # only person 1 changes - - assert len(holder._as_of_patches) == 1 - _, idx, vals = holder._as_of_patches[0] - numpy.testing.assert_array_equal(idx, [1]) - numpy.testing.assert_array_equal(vals, [9]) - - -def test_asof_retroactive_patch(): - """A set_input for a past instant is correctly reflected in all later GETs.""" - holder = _make_holder(_AsOfIntVariable) - holder.set_input("2024-01", numpy.array([1, 2])) - holder.set_input("2024-06", numpy.array([1, 9])) - # Retroactively set a change at 2024-03 (before the 2024-06 patch) - holder.set_input("2024-03", numpy.array([5, 2])) - - # Before 2024-03 patch: base only - numpy.testing.assert_array_equal(holder.get_array(period("2024-02")), [1, 2]) - # Between 2024-03 and 2024-06: 2024-03 patch applied - numpy.testing.assert_array_equal(holder.get_array(period("2024-04")), [5, 2]) - # After 2024-06: both patches applied - numpy.testing.assert_array_equal(holder.get_array(period("2024-07")), [5, 9]) - - -# --------------------------------------------------------------------------- -# 8. 
Snapshot cursor: sequential reads share array objects -# --------------------------------------------------------------------------- - - -def test_asof_snapshot_cursor_no_copy_between_patches(): - """Sequential GETs for periods with no patches between them reuse the same array.""" - holder = _make_holder(_AsOfIntVariable) - holder.set_input("2024-01", numpy.array([1, 2])) # base only, no patches - - r_feb = holder.get_array(period("2024-02")) - r_mar = holder.get_array(period("2024-03")) - assert r_feb is r_mar, "No patches between months → same snapshot array reused" - - -# --------------------------------------------------------------------------- -# 9. Stored arrays are read-only (mutation guard) -# --------------------------------------------------------------------------- - - -def test_asof_base_array_is_read_only(): - """The base array stored for an as_of variable must be read-only.""" - holder = _make_holder(_AsOfIntVariable) - holder.set_input("2024-01", numpy.array([1, 2])) - - assert not holder._as_of_base.flags.writeable, "Base array should be read-only" - - -def test_asof_get_array_returns_read_only(): - """Arrays returned by get_array for as_of variables must be read-only.""" - holder = _make_holder(_AsOfIntVariable) - holder.set_input("2024-01", numpy.array([1, 2])) - holder.set_input("2024-04", numpy.array([3, 2])) # patch - - for month in ("2024-01", "2024-03", "2024-05"): - result = holder.get_array(period(month)) - assert ( - not result.flags.writeable - ), f"Returned array for {month} should be read-only" - - -def test_asof_setting_value_does_not_mutate_caller_array(): - """set_input must not mark the *caller's* array as read-only.""" - holder = _make_holder(_AsOfIntVariable) - caller_arr = numpy.array([10, 20]) - holder.set_input("2024-01", caller_arr) - - # The caller should still be able to write to their array - caller_arr[0] = 99 # must not raise - - -# --------------------------------------------------------------------------- -# 9. 
Variable declaration validation -# --------------------------------------------------------------------------- - - -def test_as_of_true_normalises_to_start(): - """as_of=True is a documented alias for as_of='start'.""" - - class MyVar(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = True - - assert MyVar().as_of == "start" - - -def test_as_of_false_default(): - """Variables without as_of declaration default to as_of=False.""" - - class MyVar(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - - assert MyVar().as_of is False - - -def test_as_of_invalid_value_raises(): - """as_of with an invalid value must raise ValueError at instantiation.""" - - class MyVar(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "monthly" # invalid - - with pytest.raises(ValueError, match="as_of"): - MyVar() - - -def test_as_of_with_set_input_helper_raises(): - """Combining as_of with a set_input helper is explicitly forbidden.""" - from openfisca_core.holders import set_input_divide_by_period - - class MyVar(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - set_input = set_input_divide_by_period - - with pytest.raises(ValueError, match="incompatible"): - MyVar() - - -# --------------------------------------------------------------------------- -# 10. 
set_input_sparse API -# --------------------------------------------------------------------------- - - -def test_set_input_sparse_basic(): - """set_input_sparse with (idx, vals) produces the same state as set_input.""" - holder = _make_holder(_AsOfIntVariable, count=3) - holder.set_input("2024-01", numpy.array([1, 2, 3])) # base - - # Change person 1 from 2 → 9 - holder.set_input_sparse("2024-04", numpy.array([1]), numpy.array([9])) - - result = holder.get_array(period("2024-04")) - numpy.testing.assert_array_equal(result, [1, 9, 3]) - - -def test_set_input_sparse_empty(): - """An empty idx/vals produces no new patch.""" - holder = _make_holder(_AsOfIntVariable) - holder.set_input("2024-01", numpy.array([1, 2])) - - holder.set_input_sparse( - "2024-02", - numpy.array([], dtype=numpy.int32), - numpy.array([], dtype=numpy.int32), - ) - - assert len(holder._as_of_patches) == 0, "Empty patch should not be stored" - - -def test_set_input_sparse_requires_base(): - """Calling set_input_sparse before set_input raises ValueError.""" - holder = _make_holder(_AsOfIntVariable) - - with pytest.raises(ValueError, match="base"): - holder.set_input_sparse("2024-01", numpy.array([0]), numpy.array([5])) - - -def test_set_input_sparse_non_asof_raises(): - """Calling set_input_sparse on a non-as_of variable raises ValueError.""" - holder = _make_holder(_RegularVariable) - - with pytest.raises(ValueError, match="as_of"): - holder.set_input_sparse("2024-01", numpy.array([0]), numpy.array([5])) - - -def test_set_input_sparse_sequential_snapshot(): - """After sequential set_input_sparse calls the snapshot stays coherent.""" - holder = _make_holder(_AsOfIntVariable, count=4) - holder.set_input("2024-01", numpy.array([1, 2, 3, 4])) # base - - holder.set_input_sparse("2024-02", numpy.array([0]), numpy.array([10])) - holder.set_input_sparse("2024-03", numpy.array([1]), numpy.array([20])) - - # Snapshot should be at 2024-03 after two forward SETs - assert len(holder._as_of_snapshots) > 0 - - 
result = holder.get_array(period("2024-03")) - numpy.testing.assert_array_equal(result, [10, 20, 3, 4]) - - # Values before 2024-02 are unchanged - numpy.testing.assert_array_equal(holder.get_array(period("2024-01")), [1, 2, 3, 4]) - - -def test_set_input_sparse_vs_set_input_equivalence(): - """GET results are identical whether set_input or set_input_sparse is used.""" - count = 5 - base = numpy.array([1, 2, 3, 4, 5]) - idx = numpy.array([0, 2]) - new_vals = numpy.array([10, 30]) - - # Dense approach - h_dense = _make_holder(_AsOfIntVariable, count=count) - h_dense.set_input("2024-01", base.copy()) - new_full = base.copy() - new_full[idx] = new_vals - h_dense.set_input("2024-04", new_full) - - # Sparse approach - h_sparse = _make_holder(_AsOfIntVariable, count=count) - h_sparse.set_input("2024-01", base.copy()) - h_sparse.set_input_sparse("2024-04", idx, new_vals) - - for month in ("2024-01", "2024-03", "2024-04", "2024-06"): - result_dense = h_dense.get_array(period(month)) - result_sparse = h_sparse.get_array(period(month)) - numpy.testing.assert_array_equal( - result_dense, - result_sparse, - err_msg=f"Dense and sparse results differ for {month}", - ) - - -# --------------------------------------------------------------------------- -# 9. 
LRU snapshot configuration -# --------------------------------------------------------------------------- - - -def test_snapshot_max_defaults_to_3(): - """_as_of_max_snapshots defaults to 3 when no configuration is provided.""" - holder = _make_holder(_AsOfIntVariable) - assert holder._as_of_max_snapshots == 3 - - -def test_snapshot_max_from_variable_attribute(): - """snapshot_count on the Variable class sets _as_of_max_snapshots.""" - - class _HighSnapshotVariable(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - snapshot_count = 7 - - holder = _make_holder(_HighSnapshotVariable) - assert holder._as_of_max_snapshots == 7 - - -def test_snapshot_max_from_memory_config(): - """MemoryConfig.asof_max_snapshots is used when the variable has no override.""" - holder = _make_holder_with_memory_config(_AsOfIntVariable, asof_max_snapshots=5) - assert holder._as_of_max_snapshots == 5 - - -def test_snapshot_max_variable_overrides_memory_config(): - """Variable.snapshot_count takes priority over MemoryConfig.asof_max_snapshots.""" - - class _OverrideVariable(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - snapshot_count = 2 - - holder = _make_holder_with_memory_config(_OverrideVariable, asof_max_snapshots=9) - assert holder._as_of_max_snapshots == 2 - - -def test_lru_eviction_respects_max_snapshots(): - """The cache never holds more than _as_of_max_snapshots entries.""" - - class _TinyLRUVariable(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - snapshot_count = 2 - - holder = _make_holder(_TinyLRUVariable, count=3) - base = numpy.array([1, 2, 3]) - holder.set_input("2024-01", base) - - # Each SET adds a snapshot; the cache must stay at max 2. 
- for month in ("2024-02", "2024-03", "2024-04", "2024-05"): - holder.set_input(month, base + 1) - assert len(holder._as_of_snapshots) <= 2 - - -def test_lru_correctness_after_eviction(): - """GET returns correct values even for instants whose snapshot was evicted.""" - - class _SmallLRUVariable(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - snapshot_count = 1 # only ever keep one snapshot - - holder = _make_holder(_SmallLRUVariable, count=2) - holder.set_input("2024-01", numpy.array([1, 10])) - holder.set_input("2024-03", numpy.array([3, 30])) - holder.set_input("2024-06", numpy.array([6, 60])) - - # Snapshot for 2024-01 and 2024-03 were evicted; full reconstruction needed. - numpy.testing.assert_array_equal(holder.get_array(period("2024-02")), [1, 10]) - numpy.testing.assert_array_equal(holder.get_array(period("2024-04")), [3, 30]) - numpy.testing.assert_array_equal(holder.get_array(period("2024-07")), [6, 60]) - - -def test_lru_multi_snapshot_non_linear_access(): - """With max_snapshots=3 both P and P-12 are served without full reconstruction.""" - - class _MultiSnapVariable(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - snapshot_count = 3 - - holder = _make_holder(_MultiSnapVariable, count=2) - holder.set_input("2023-01", numpy.array([0, 0])) - holder.set_input("2024-01", numpy.array([1, 10])) - holder.set_input("2024-02", numpy.array([2, 20])) - holder.set_input("2024-03", numpy.array([3, 30])) - - # Simulate a formula accessing both current and year-ago month. 
- numpy.testing.assert_array_equal(holder.get_array(period("2024-03")), [3, 30]) - numpy.testing.assert_array_equal( - holder.get_array(period("2023-03")), [0, 0] # before any patch → base - ) - numpy.testing.assert_array_equal(holder.get_array(period("2024-02")), [2, 20]) diff --git a/tests/core/test_group_population_condition.py b/tests/core/test_group_population_condition.py deleted file mode 100644 index 464c44d5c..000000000 --- a/tests/core/test_group_population_condition.py +++ /dev/null @@ -1,179 +0,0 @@ -"""Tests for the condition parameter on GroupPopulation aggregation methods. - -Verifies that sum, any, all, min, max, nb_persons accept a `condition` -keyword argument and produce correct results when combined with `role`. -""" - -from copy import deepcopy - -import pytest - -from openfisca_core import tools -from openfisca_core.simulations import SimulationBuilder - -# -- Fixtures --------------------------------------------------------------- - -TEST_CASE = { - "persons": {"ind0": {}, "ind1": {}, "ind2": {}, "ind3": {}, "ind4": {}, "ind5": {}}, - "households": { - "h1": {"children": ["ind2", "ind3"], "adults": ["ind0", "ind1"]}, - "h2": {"children": ["ind5"], "adults": ["ind4"]}, - }, -} - -TEST_CASE_AGES = deepcopy(TEST_CASE) -AGES = [40, 37, 7, 9, 54, 20] -for _ind, _age in zip(TEST_CASE_AGES["persons"].values(), AGES): - _ind["age"] = _age - -MONTH = "2016-01" -YEAR = 2016 - - -@pytest.fixture -def sim(tax_benefit_system): - test_case = deepcopy(TEST_CASE_AGES) - test_case["persons"]["ind0"]["salary"] = 1000 - test_case["persons"]["ind1"]["salary"] = 1500 - test_case["persons"]["ind2"]["salary"] = 200 - test_case["persons"]["ind3"]["salary"] = 300 - test_case["persons"]["ind4"]["salary"] = 3000 - test_case["persons"]["ind5"]["salary"] = 500 - sb = SimulationBuilder() - sb.set_default_period(MONTH) - return sb.build_from_entities(tax_benefit_system, test_case) - - -# -- Tests ------------------------------------------------------------------- - - 
-class TestSumWithCondition: - def test_sum_condition_only(self, sim): - household = sim.household - salary = household.members("salary", MONTH) - age = household.members("age", MONTH) - is_adult = age >= 18 - - result = household.sum(salary, condition=is_adult) - # h1: ind0(1000, 40) + ind1(1500, 37) = 2500 (children excluded: ind2 age 7, ind3 age 9) - # h2: ind4(3000, 54) + ind5(500, 20) = 3500 (both adults) - tools.assert_near(result, [2500, 3500]) - - def test_sum_role_and_condition(self, sim): - from openfisca_country_template import entities - - CHILD = entities.Household.CHILD - - household = sim.household - salary = household.members("salary", MONTH) - age = household.members("age", MONTH) - is_older_than_8 = age > 8 - - result = household.sum(salary, role=CHILD, condition=is_older_than_8) - # h1 children: ind2(200, age 7 → excluded), ind3(300, age 9 → included) = 300 - # h2 children: ind5(500, age 20 → included) = 500 - tools.assert_near(result, [300, 500]) - - -class TestAnyWithCondition: - def test_any_condition_only(self, sim): - household = sim.household - salary = household.members("salary", MONTH) - age = household.members("age", MONTH) - is_child = age < 18 - - # Any child with salary > 250? - result = household.any(salary > 250, condition=is_child) - # h1: ind2(200 → no), ind3(300 → yes) → True - # h2: ind5(500 → yes, but age 20 → not a child) → False - tools.assert_near(result, [True, False]) - - -class TestAllWithCondition: - def test_all_condition_only(self, sim): - household = sim.household - salary = household.members("salary", MONTH) - age = household.members("age", MONTH) - is_adult = age >= 18 - - # All adults have salary >= 1000? 
- result = household.all(salary >= 1000, condition=is_adult) - # h1: ind0(1000 → yes), ind1(1500 → yes) → True - # h2: ind4(3000 → yes), ind5(500 → no) → False - tools.assert_near(result, [True, False]) - - -class TestMinMaxWithCondition: - def test_min_condition_only(self, sim): - household = sim.household - salary = household.members("salary", MONTH) - age = household.members("age", MONTH) - is_adult = age >= 18 - - result = household.min(salary, condition=is_adult) - # h1: min(1000, 1500) = 1000 - # h2: min(3000, 500) = 500 - tools.assert_near(result, [1000, 500]) - - def test_max_condition_only(self, sim): - household = sim.household - salary = household.members("salary", MONTH) - age = household.members("age", MONTH) - is_adult = age >= 18 - - result = household.max(salary, condition=is_adult) - # h1: max(1000, 1500) = 1500 - # h2: max(3000, 500) = 3000 - tools.assert_near(result, [1500, 3000]) - - -class TestNbPersonsWithCondition: - def test_nb_persons_condition_only(self, sim): - household = sim.household - age = household.members("age", MONTH) - is_adult = age >= 18 - - result = household.nb_persons(condition=is_adult) - # h1: ind0(40), ind1(37) → 2 adults - # h2: ind4(54), ind5(20) → 2 adults - tools.assert_near(result, [2, 2]) - - def test_nb_persons_role_and_condition(self, sim): - from openfisca_country_template import entities - - CHILD = entities.Household.CHILD - - household = sim.household - salary = household.members("salary", MONTH) - has_income = salary > 0 - - result = household.nb_persons(role=CHILD, condition=has_income) - # h1 children: ind2(200 → yes), ind3(300 → yes) → 2 - # h2 children: ind5(500 → yes) → 1 - tools.assert_near(result, [2, 1]) - - -class TestBackwardCompatibility: - """Every existing call without condition should still work identically.""" - - def test_sum_without_condition(self, sim): - household = sim.household - salary = household.members("salary", MONTH) - result = household.sum(salary) - tools.assert_near(result, 
[3000, 3500]) - - def test_min_without_condition(self, sim): - household = sim.household - age = household.members("age", MONTH) - result = household.min(age) - tools.assert_near(result, [7, 20]) - - def test_max_without_condition(self, sim): - household = sim.household - age = household.members("age", MONTH) - result = household.max(age) - tools.assert_near(result, [40, 54]) - - def test_nb_persons_without_condition(self, sim): - household = sim.household - tools.assert_near(household.nb_persons(), [4, 2]) diff --git a/tests/core/test_link_accessors.py b/tests/core/test_link_accessors.py deleted file mode 100644 index 3d4570524..000000000 --- a/tests/core/test_link_accessors.py +++ /dev/null @@ -1,92 +0,0 @@ -import numpy -import pytest - -from openfisca_core import entities, periods, taxbenefitsystems, variables -from openfisca_core.links.implicit import ImplicitMany2OneLink, ImplicitOne2ManyLink -from openfisca_core.simulations import SimulationBuilder - - -@pytest.fixture -def simple_sim(): - # two households, variable salaries on persons, roles for persons - person = entities.SingleEntity("person", "persons", "A person", "") - household = entities.GroupEntity( - "household", - "households", - "A household", - "", - roles=[{"key": "parent"}, {"key": "child"}], - ) - - tbs = taxbenefitsystems.TaxBenefitSystem([person, household]) - - class salary(variables.Variable): - value_type = float - entity = person - definition_period = periods.DateUnit.YEAR - - class rent(variables.Variable): - value_type = float - entity = household - definition_period = periods.DateUnit.YEAR - - tbs.add_variable(salary) - tbs.add_variable(rent) - - sim = SimulationBuilder().build_from_dict( - tbs, - { - "persons": { - "p0": {"salary": {"2024": 1000.0}}, - "p1": {"salary": {"2024": 500.0}}, - "p2": {"salary": {"2024": 2000.0}}, - "p3": {"salary": {"2024": 100.0}}, - }, - "households": { - "h0": {"parent": ["p0"], "child": ["p1"], "rent": {"2024": 800.0}}, - "h1": {"parent": ["p2"], 
"child": [], "rent": {"2024": 500.0}}, - "h2": {"parent": [], "child": ["p3"], "rent": {"2024": 100.0}}, - }, - }, - ) - return sim - - -def test_nth_accessor(simple_sim): - link = ImplicitOne2ManyLink("persons", "household") - link.attach(simple_sim.populations["household"]) - link.resolve(simple_sim.populations) - - # salaries grouped by household: h0->[p0(1000),p1(500)], h1->[p2(2000)], h2->[p3(100)] - first = link.nth(0, "salary", "2024") - assert numpy.array_equal(first, [1000.0, 2000.0, 100.0]) - - second = link.nth(1, "salary", "2024") - # only h0 has second member - assert numpy.array_equal(second, [500.0, 0.0, 0.0]) - - -def test_one2many_get_by_role(simple_sim): - link = ImplicitOne2ManyLink("persons", "household") - link.attach(simple_sim.populations["household"]) - link.resolve(simple_sim.populations) - - rents_parent = link.get_by_role("salary", "2024", role_value="parent") - # salary of parent in each household: h0->1000, h1->2000, h2->0 - assert numpy.array_equal(rents_parent, [1000.0, 2000.0, 0.0]) - - -def test_get_by_role(simple_sim): - # use implicit many2one to fetch household rent per person using role - many = ImplicitMany2OneLink("household") - many.attach(simple_sim.persons) - many.resolve(simple_sim.populations) - - # we expect parent/child roles available on person side - rents_parent = many.get_by_role("rent", "2024", role_value="parent") - # p0 is parent in h0 -> 800, p2 parent in h1 -> 500, others no parent - assert numpy.array_equal(rents_parent, [800.0, 0.0, 500.0, 0.0]) - - rents_child = many.get_by_role("rent", "2024", role_value="child") - # p1 child of h0 -> 800, p3 child h2 ->100 - assert numpy.array_equal(rents_child, [0.0, 800.0, 0.0, 100.0]) diff --git a/tests/core/test_period_id_to_rownum.py b/tests/core/test_period_id_to_rownum.py deleted file mode 100644 index 227835a1c..000000000 --- a/tests/core/test_period_id_to_rownum.py +++ /dev/null @@ -1,146 +0,0 @@ -import numpy as np - -from openfisca_core import entities, periods, 
taxbenefitsystems -from openfisca_core.entities.entity import Entity -from openfisca_core.populations._core_population import CorePopulation -from openfisca_core.simulations import SimulationBuilder - - -def test_get_period_id_to_rownum_remapping(): - # Setup entity and population - entity = Entity("person", "people", "", "") - pop = CorePopulation(entity) - pop.count = 3 - - # initial mapping: identity (0->0,1->1,2->2) - pop._id_to_rownum = np.array([0, 1, 2], dtype=np.intp) - t0 = periods.period("2010-01") - pop.snapshot_period(t0) - - # change mapping to simulate reordering/new indexing - # now id 0 -> row 2, id 1 -> row 0, id 2 -> row 1 - pop._id_to_rownum = np.array([2, 0, 1], dtype=np.intp) - t1 = periods.period("2010-02") - pop.snapshot_period(t1) - - # ids to remap (as stored in data referring to t0) - ids = np.array([0, 0, 1, 2, 0], dtype=np.intp) - - past_index = pop.get_period_id_to_rownum(t0) - assert past_index is not None - rows_t0 = past_index[ids] - # with identity mapping, rows should equal the ids - assert np.array_equal(rows_t0, np.array([0, 0, 1, 2, 0], dtype=np.intp)) - - # current mapping produces different rows - current_index = pop.get_period_id_to_rownum(t1) - assert current_index is not None - rows_t1 = current_index[ids] - assert np.array_equal(rows_t1, np.array([2, 2, 0, 1, 2], dtype=np.intp)) - - -def _make_tbs(): - """Return a minimal TaxBenefitSystem with one person entity.""" - person = entities.Entity("person", "persons", "", "") - return taxbenefitsystems.TaxBenefitSystem([person]) - - -def _make_group_tbs(): - """Return a TaxBenefitSystem with a person and a group entity.""" - person = entities.SingleEntity("person", "persons", "", "") - household = entities.GroupEntity( - "household", "households", "", "", roles=[{"key": "member"}] - ) - return taxbenefitsystems.TaxBenefitSystem([person, household]) - - -def test_build_default_simulation_sets_id_to_rownum(): - """SimulationBuilder.build_default_simulation populates 
_id_to_rownum.""" - tbs = _make_tbs() - sim = SimulationBuilder().build_default_simulation(tbs, count=3) - for pop in sim.populations.values(): - assert pop._id_to_rownum is not None - assert np.array_equal(pop._id_to_rownum, np.arange(3, dtype=np.intp)) - - -def test_build_from_dict_sets_id_to_rownum(): - """SimulationBuilder.build_from_dict populates _id_to_rownum.""" - tbs = _make_tbs() - sim = SimulationBuilder().build_from_dict( - tbs, - { - "persons": { - "p0": {}, - "p1": {}, - "p2": {}, - }, - }, - ) - pop = sim.populations["person"] - assert pop._id_to_rownum is not None - assert np.array_equal(pop._id_to_rownum, np.arange(3, dtype=np.intp)) - - -def test_build_default_simulation_empty(): - """count=0 produces an empty identity mapping, not None.""" - tbs = _make_tbs() - sim = SimulationBuilder().build_default_simulation(tbs, count=0) - for pop in sim.populations.values(): - assert pop._id_to_rownum is not None - assert pop._id_to_rownum.shape == (0,) - assert pop._id_to_rownum.dtype == np.intp - - -def test_build_default_simulation_single(): - """count=1 produces a single-element identity mapping.""" - tbs = _make_tbs() - sim = SimulationBuilder().build_default_simulation(tbs, count=1) - for pop in sim.populations.values(): - assert np.array_equal(pop._id_to_rownum, np.array([0], dtype=np.intp)) - - -def test_build_default_simulation_group_tbs_both_populations(): - """Both person and group populations receive _id_to_rownum.""" - tbs = _make_group_tbs() - sim = SimulationBuilder().build_default_simulation(tbs, count=2) - assert np.array_equal(sim.populations["person"]._id_to_rownum, [0, 1]) - assert np.array_equal(sim.populations["household"]._id_to_rownum, [0, 1]) - - -def test_build_from_dict_group_tbs_both_populations(): - """build_from_dict sets _id_to_rownum on both person and group populations.""" - tbs = _make_group_tbs() - sim = SimulationBuilder().build_from_dict( - tbs, - { - "persons": {"p0": {}, "p1": {}, "p2": {}}, - "households": { - "h0": 
{"member": ["p0", "p1"]}, - "h1": {"member": ["p2"]}, - }, - }, - ) - assert np.array_equal( - sim.populations["person"]._id_to_rownum, np.arange(3, dtype=np.intp) - ) - assert np.array_equal( - sim.populations["household"]._id_to_rownum, np.arange(2, dtype=np.intp) - ) - - -def test_id_to_rownum_dtype(): - """_id_to_rownum always has dtype numpy.intp.""" - tbs = _make_tbs() - sim = SimulationBuilder().build_default_simulation(tbs, count=4) - pop = sim.populations["person"] - assert pop._id_to_rownum.dtype == np.intp - - -def test_id_to_rownum_usable_as_index(): - """_id_to_rownum identity mapping round-trips: rownum[id] == id.""" - tbs = _make_tbs() - count = 5 - sim = SimulationBuilder().build_default_simulation(tbs, count=count) - pop = sim.populations["person"] - ids = np.arange(count, dtype=np.intp) - assert np.array_equal(pop._id_to_rownum[ids], ids) diff --git a/tests/core/test_reforms.py b/tests/core/test_reforms.py index 9674375e8..bc1d1747d 100644 --- a/tests/core/test_reforms.py +++ b/tests/core/test_reforms.py @@ -447,7 +447,7 @@ class wrong_reform(Reform): # A Reform must implement an `apply` method pass - with pytest.raises(Exception, match="must define an `apply`"): + with pytest.raises(Exception): # noqa: B017 wrong_reform(tax_benefit_system) diff --git a/tests/core/test_transition_formula.py b/tests/core/test_transition_formula.py deleted file mode 100644 index 5d9bab057..000000000 --- a/tests/core/test_transition_formula.py +++ /dev/null @@ -1,560 +0,0 @@ -"""Tests for the transition_formula feature. - -transition_formula is an alternative to formula for as_of variables. -Instead of returning a full N-array, it returns (selector, vals) describing -only the individuals that change state — enabling O(k) sparse storage with -no O(N) diff computation. - -Naming convention mirrors formula_YYYY_MM_DD: transition_formula_YYYY_MM_DD. 
-""" - -from __future__ import annotations - -import numpy -import pytest - -from openfisca_core.entities import Entity -from openfisca_core.periods import DateUnit -from openfisca_core.populations import Population -from openfisca_core.simulations import Simulation -from openfisca_core.taxbenefitsystems import TaxBenefitSystem -from openfisca_core.variables import Variable - -# --------------------------------------------------------------------------- -# Minimal fixtures — no country-template dependency -# --------------------------------------------------------------------------- - -_entity = Entity("person", "persons", "", "") - - -def _make_simulation(*variable_classes, count: int = 3) -> Simulation: - """Build a minimal Simulation with the given variable classes. - - TaxBenefitSystem copies the entity internally, so the Population must use - tbs.person_entity (the copy) to ensure _tax_benefit_system is set. - """ - tbs = TaxBenefitSystem([_entity]) - person_entity = tbs.person_entity # the copy that has _tax_benefit_system set - for vc in variable_classes: - tbs.add_variable(vc) - pop = Population(person_entity) - pop.count = count - pop.ids = [str(i) for i in range(count)] - sim = Simulation(tbs, {person_entity.key: pop}) - return sim - - -# --------------------------------------------------------------------------- -# 1. 
Variable-level validation — no simulation needed -# --------------------------------------------------------------------------- - - -def test_transition_formula_requires_asof(): - """Declaring transition_formula without as_of must raise at instantiation.""" - - class MyVar(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - - def transition_formula(person, period): # noqa: N805 - return numpy.array([False, False, False]), numpy.array([]) - - with pytest.raises(ValueError, match="as_of"): - MyVar() - - -def test_transition_formula_exclusive_with_formula(): - """Declaring both formula and transition_formula must raise at instantiation.""" - - class MyVar(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - - def formula(person, period): # noqa: N805 - return numpy.zeros(3, dtype=numpy.int32) - - def transition_formula(person, period): # noqa: N805 - return numpy.array([False, False, False]), numpy.array([]) - - with pytest.raises(ValueError, match="mutually exclusive"): - MyVar() - - -def test_transition_formula_is_not_input_variable(): - """A variable with transition_formula is not an input variable.""" - - class MyVar(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - - def transition_formula(person, period): # noqa: N805 - return numpy.array([False, False, False]), numpy.array([]) - - assert not MyVar().is_input_variable() - - -def test_has_transition_formula_property(): - """has_transition_formula reflects whether transition_formula* exist.""" - - class WithTransition(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - - def transition_formula(person, period): # noqa: N805 - return numpy.array([False, False, False]), numpy.array([]) - - class WithoutTransition(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - - assert 
WithTransition().has_transition_formula - assert not WithoutTransition().has_transition_formula - - -# --------------------------------------------------------------------------- -# 2. Date dispatch — transition_formula_YYYY_MM_DD -# --------------------------------------------------------------------------- - - -def test_transition_formula_date_dispatch(): - """transition_formula_2024 replaces transition_formula from 2024 onwards.""" - - class Echelon(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - - def transition_formula(person, period): # noqa: N805 - # Rule before 2024: person 0 gets +1 - return numpy.array([True, False, False]), numpy.array([10]) - - def transition_formula_2024_01_01(person, period): # noqa: N805 - # Rule from 2024: person 1 gets +2 - return numpy.array([False, True, False]), numpy.array([20]) - - var = Echelon() - # get_transition_formula returns the raw unbound function from the SortedDict, - # so compare against the class-level function (not the bound method via var.xxx). - assert var.get_transition_formula("2023-06") is Echelon.transition_formula - assert ( - var.get_transition_formula("2024-01") is Echelon.transition_formula_2024_01_01 - ) - assert ( - var.get_transition_formula("2025-03") is Echelon.transition_formula_2024_01_01 - ) - - -# --------------------------------------------------------------------------- -# 3. 
Basic execution — simulation.calculate triggers transition_formula -# --------------------------------------------------------------------------- - - -def test_transition_formula_basic(): - """transition_formula result is correctly stored and returned.""" - - class Score(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - - def transition_formula(person, period): # noqa: N805 - # person 1 transitions to 99 - return numpy.array([False, True, False]), numpy.array([99]) - - sim = _make_simulation(Score) - sim.set_input("Score", "2024-01", numpy.array([1, 2, 3])) - - result = sim.calculate("Score", "2024-02") - numpy.testing.assert_array_equal(result, [1, 99, 3]) - - -def test_transition_formula_none_return_persists_previous(): - """Returning None from transition_formula leaves the previous state intact.""" - - class Score(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - - def transition_formula(person, period): # noqa: N805 - return None # no change - - sim = _make_simulation(Score) - sim.set_input("Score", "2024-01", numpy.array([10, 20, 30])) - - result = sim.calculate("Score", "2024-03") - numpy.testing.assert_array_equal(result, [10, 20, 30]) - - -def test_transition_formula_scalar_vals(): - """A scalar val is broadcast to all selected individuals.""" - - class Bonus(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - - def transition_formula(person, period): # noqa: N805 - # Everyone gets a flat 500 - return numpy.array([True, True, True]), 500 - - sim = _make_simulation(Bonus) - sim.set_input("Bonus", "2024-01", numpy.array([0, 0, 0])) - - result = sim.calculate("Bonus", "2024-02") - numpy.testing.assert_array_equal(result, [500, 500, 500]) - - -def test_transition_formula_computed_once_per_instant(): - """transition_formula is not called twice for the same instant.""" - call_count = [0] - - class 
Counter(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - - def transition_formula(person, period): # noqa: N805 - call_count[0] += 1 - return numpy.array([True, False, False]), numpy.array([call_count[0]]) - - sim = _make_simulation(Counter) - sim.set_input("Counter", "2024-01", numpy.array([0, 0, 0])) - - r1 = sim.calculate("Counter", "2024-02") - r2 = sim.calculate("Counter", "2024-02") - - assert call_count[0] == 1, "Formula must be called exactly once per instant" - numpy.testing.assert_array_equal(r1, r2) - - -def test_transition_formula_no_base_raises(): - """If set_input was never called and no initial_formula is defined, raise.""" - - class Score(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - - def transition_formula(person, period): # noqa: N805 - return numpy.array([True, False, False]), numpy.array([99]) - - sim = _make_simulation(Score) - with pytest.raises(ValueError, match="no initial state"): - sim.calculate("Score", "2024-01") - - -# --------------------------------------------------------------------------- -# 3b. 
initial_formula -# --------------------------------------------------------------------------- - - -def test_initial_formula_establishes_base(): - """initial_formula is called on first access and establishes the base.""" - - class Score(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - - def initial_formula(person, period): # noqa: N805 - return numpy.array([10, 20, 30]) - - def transition_formula(person, period): # noqa: N805 - return numpy.array([False, False, False]), numpy.array([]) - - sim = _make_simulation(Score) - result = sim.calculate("Score", "2024-01") - numpy.testing.assert_array_equal(result, [10, 20, 30]) - - -def test_initial_formula_then_transition(): - """initial_formula seeds the state; transition_formula evolves it.""" - - class Score(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - - def initial_formula(person, period): # noqa: N805 - return numpy.array([1, 2, 3]) - - def transition_formula(person, period): # noqa: N805 - # First person gains 10 each month. 
- return numpy.array([0]), numpy.array( - [person("Score", period.last_month)[0] + 10] - ) - - sim = _make_simulation(Score) - sim.calculate("Score", "2024-01") # seeds: [1, 2, 3] - result = sim.calculate("Score", "2024-02") - numpy.testing.assert_array_equal(result, [11, 2, 3]) - - -def test_initial_formula_requires_as_of(): - """initial_formula without as_of raises at instantiation time.""" - - class Bad(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - - def initial_formula(person, period): # noqa: N805 - return numpy.zeros(3) - - with pytest.raises(ValueError, match="initial_formula.*as_of"): - Bad() - - -def test_initial_formula_date_dispatch(): - """initial_formula_YYYY_MM_DD dispatch works like formula_YYYY_MM_DD.""" - - class Score(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - - def initial_formula(person, period): # noqa: N805 - return numpy.array([0, 0, 0]) - - def initial_formula_2025_01_01(person, period): # noqa: N805 - return numpy.array([99, 99, 99]) - - def transition_formula(person, period): # noqa: N805 - return numpy.array([], dtype=numpy.int32), numpy.array([]) - - sim_before = _make_simulation(Score) - sim_before.calculate("Score", "2024-06") - numpy.testing.assert_array_equal( - sim_before.get_array("Score", "2024-06"), [0, 0, 0] - ) - - sim_after = _make_simulation(Score) - sim_after.calculate("Score", "2025-06") - numpy.testing.assert_array_equal( - sim_after.get_array("Score", "2025-06"), [99, 99, 99] - ) - - -# --------------------------------------------------------------------------- -# 4. 
Date dispatch integration -# --------------------------------------------------------------------------- - - -def test_transition_formula_date_dispatch_integration(): - """The correct dated transition_formula is applied for each period.""" - - class Echelon(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - - def transition_formula(person, period): # noqa: N805 - # Old rule: person 0 gets 10 - return numpy.array([True, False, False]), numpy.array([10]) - - def transition_formula_2024_06_01(person, period): # noqa: N805 - # New rule: person 1 gets 20 - return numpy.array([False, True, False]), numpy.array([20]) - - sim = _make_simulation(Echelon) - sim.set_input("Echelon", "2024-01", numpy.array([0, 0, 0])) - - # Before 2024-06: old rule applies → person 0 → 10 - r_jan = sim.calculate("Echelon", "2024-02") - numpy.testing.assert_array_equal(r_jan, [10, 0, 0]) - - # From 2024-06: new rule applies → person 1 → 20 - r_jun = sim.calculate("Echelon", "2024-06") - numpy.testing.assert_array_equal(r_jun, [10, 20, 0]) - - -# --------------------------------------------------------------------------- -# 5. Mismatch between selector length and vals length → clear error -# --------------------------------------------------------------------------- - - -def test_transition_formula_length_mismatch_raises(): - """Returning mismatched selector/vals lengths must raise ValueError.""" - - class Bad(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - - def transition_formula(person, period): # noqa: N805 - # mask selects 2 but vals has 3 → error - return numpy.array([True, True, False]), numpy.array([1, 2, 3]) - - sim = _make_simulation(Bad) - sim.set_input("Bad", "2024-01", numpy.array([0, 0, 0])) - - with pytest.raises(ValueError, match="2 selected"): - sim.calculate("Bad", "2024-02") - - -# --------------------------------------------------------------------------- -# 5. 
Tracer — cycle vs spiral -# --------------------------------------------------------------------------- - - -def test_transition_formula_temporal_recursion_not_spiral(): - """Reading the same as_of variable at a previous period must NOT trigger - SpiralError — the recursion terminates via _as_of_transition_computed.""" - - class Score(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - - def initial_formula(person, period): # noqa: N805 - return numpy.array([0, 0, 0]) - - def transition_formula(person, period): # noqa: N805 - # reads the same variable one month back — legitimate temporal recursion - prev = person("Score", period.last_month) - return numpy.array([0, 1, 2]), prev[[0, 1, 2]] + 1 - - sim = _make_simulation(Score) - # sequential: init=0 → +1 at 2024-02 → +1 at 2024-03 = [2, 2, 2] - sim.calculate("Score", "2024-01") - sim.calculate("Score", "2024-02") - result = sim.calculate("Score", "2024-03") - numpy.testing.assert_array_equal(result, [2, 2, 2]) - - -def test_transition_formula_true_cycle_raises(): - """A transition_formula that reads the same variable@same period must - raise CycleError (genuine infinite loop).""" - - class Score(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - - def transition_formula(person, period): # noqa: N805 - # reads itself at the SAME period → true cycle - person("Score", period) - return numpy.array([], dtype=numpy.int32), numpy.array([]) - - sim = _make_simulation(Score) - sim.set_input("Score", "2024-01", numpy.array([0, 0, 0])) - # CycleError is caught inside _calculate_transition; the call completes - # with the previous state (no patch applied) rather than crashing. - result = sim.calculate("Score", "2024-02") - numpy.testing.assert_array_equal(result, [0, 0, 0]) - - -# --------------------------------------------------------------------------- -# 6. 
formula_type in trace -# --------------------------------------------------------------------------- - - -def test_formula_type_initial_recorded_in_trace(): - """When initial_formula is called, the trace node has formula_type='initial'.""" - - class Score(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - - def initial_formula(person, period): # noqa: N805 - return numpy.array([1, 2, 3]) - - def transition_formula(person, period): # noqa: N805 - return numpy.array([], dtype=numpy.int32), numpy.array([]) - - sim = _make_simulation(Score) - sim.trace = True - sim.calculate("Score", "2024-01") - - nodes = list(sim.tracer.browse_trace()) - score_nodes = [n for n in nodes if n.name == "Score"] - assert len(score_nodes) == 1 - assert score_nodes[0].formula_type == "initial" - - -def test_formula_type_transition_recorded_in_trace(): - """When transition_formula runs, the trace node has formula_type='transition'.""" - - class Score(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - - def transition_formula(person, period): # noqa: N805 - return numpy.array([], dtype=numpy.int32), numpy.array([]) - - sim = _make_simulation(Score) - sim.set_input("Score", "2024-01", numpy.array([0, 0, 0])) - sim.trace = True - sim.calculate("Score", "2024-02") - - nodes = list(sim.tracer.browse_trace()) - score_nodes = [n for n in nodes if n.name == "Score"] - assert len(score_nodes) == 1 - assert score_nodes[0].formula_type == "transition" - - -def test_formula_type_none_for_regular_formula(): - """Regular formula nodes have formula_type=None.""" - from openfisca_core.tracers import FullTracer - - tracer = FullTracer() - tracer.record_calculation_start("salary", 2024) - tracer.record_calculation_result(numpy.array([100])) - tracer.record_calculation_end() - - assert tracer.trees[0].formula_type is None - - -def test_computation_log_show_formula_type(): - """show_formula_type=True 
adds [initial]/[transition] tags in computation log lines.""" - - class Score(Variable): - entity = _entity - definition_period = DateUnit.MONTH - value_type = int - as_of = "start" - - def initial_formula(person, period): # noqa: N805 - return numpy.array([0, 0, 0]) - - def transition_formula(person, period): # noqa: N805 - return numpy.array([], dtype=numpy.int32), numpy.array([]) - - sim = _make_simulation(Score) - sim.trace = True - sim.calculate("Score", "2024-01") # initial - sim.calculate("Score", "2024-02") # transition - - lines_with = sim.tracer.computation_log.lines(show_formula_type=True) - lines_without = sim.tracer.computation_log.lines(show_formula_type=False) - - assert any("[initial]" in line for line in lines_with) - assert any("[transition]" in line for line in lines_with) - assert not any("[initial]" in line for line in lines_without) - assert not any("[transition]" in line for line in lines_without)