diff --git a/examples/cardinal_project.yaml b/examples/cardinal_project.yaml new file mode 100644 index 0000000..5b09534 --- /dev/null +++ b/examples/cardinal_project.yaml @@ -0,0 +1,64 @@ +Project: + + Name: test_cardinal + + # Include other configuration files + Includes: + - examples/cardinal_project_library.yaml + + PathTemplates: {} + + CommonPaths: + root: /global/cfs/cdirs/lsst/groups/PZ/Cardinal + scratch_root: "{root}" + catalogs_dir: "{root}/parquet_files" + project: test_cardinal + sim_version: v1 + + # Baseline configuration, included in others by default + Baseline: + catalog_tag: cardinal + pipelines: ['all'] + file_aliases: # Set the training and test files + test: test_file_100k + train: train_file_100k + train_zCOSMOS: train_file_zCOSMOS_100k + wide: wide_file_full + deep: deep_file_full + spec: spec_file_full + + # These define the variant configurations for the various parts of the analysis + Flavors: + - Flavor: + name: train_cosmos + pipelines: ['pz', 'tomography'] + file_aliases: # Set the training and test files + test: test_file_100k + train: train_file_zCOSMOS_100k + - Flavor: + name: gpz_gl + pipelines: ['pz'] # only run the pz pipeline + pipeline_overrides: # Override specifics for particular pipelines + default: + kwargs: + algorithms: ['gpz'] # Only run gpz + inform: + inform_gpz: + gpz_method: GL + + # These are variables that we iterate over when running over entire catalogs + IterationVars: + healpix: + - 427 + - 428 + - 429 + - 430 + - 431 + - 432 + - 73 + - 122 + - 99 + - 16 + - 58 + - 277 + - 346 diff --git a/examples/cardinal_project_library.yaml b/examples/cardinal_project_library.yaml new file mode 100644 index 0000000..bb2f34c --- /dev/null +++ b/examples/cardinal_project_library.yaml @@ -0,0 +1,292 @@ +# This is the "library" of common definitions shared between similar projects + +# Templates we use to define catalogs +# +# A CatalogTemplate can be given several interpolants to resolve to a specific Catalog +# +# Some of the
interpolants [{catalogs_dir}, {project}, {sim_version}] +# are defined in the Project CommonPaths block +# +# Possible values of {selection} are defined in the library "Selections" block +# +# Possible values of {flavor} are defined in the Project "Flavors" block +Catalogs: + # The input "Truth" catalog + - CatalogTemplate: + name: truth + path_template: "{catalogs_dir}/Chinchilla-3-triofile.{healpix}.parquet" + iteration_vars: ['healpix'] + # Catalogs that have been reduced using a particular selection + - CatalogTemplate: + name: reduced + path_template: "{catalogs_dir}/{project}_{selection}/{healpix}/Chinchilla-3-triofile.pq" + iteration_vars: ['healpix'] + # Data output from running some degraders on the original catalog + # Note that the {basename} can be used to select which degraded output we want + - CatalogTemplate: + name: degraded + path_template: "{catalogs_dir}/{project}_{selection}_{flavor}/{healpix}/{basename}" + iteration_vars: ['healpix'] + + +# This file describes the templates we use to define files and specific instances +# of files +# +# A FileTemplate can be given several interpolants to resolve to a specific FileInstance +# +# Some of the interpolants [{catalogs_dir}, {project}] +# are defined in the Project CommonPaths block +# +# Possible values of {selection} are defined in the library "Selections" block +Files: + # Testing data file + - FileTemplate: + name: test_file_100k + path_template: "{catalogs_dir}/test/{project}_{selection}_baseline_100k.hdf5" + # Generic training data file + - FileTemplate: + name: train_file_100k + path_template: "{catalogs_dir}/test/{project}_{selection}_baseline_100k.hdf5" + - FileTemplate: + name: train_file_200k + path_template: "{catalogs_dir}/test/{project}_{selection}_baseline_200k.hdf5" + - FileTemplate: + name: train_file_10 + path_template: "{catalogs_dir}/test/{project}_{selection}_baseline_10.hdf5" + - FileTemplate: + name: test_split_file + path_template:
"{catalogs_dir}/test/{project}_{selection}_baseline_split_test_10.hdf5" + - FileTemplate: + name: train_split_file + path_template: "{catalogs_dir}/train/{project}_{selection}_baseline_split_train_10.hdf5" + - FileTemplate: + name: deep_file_full + path_template: "{catalogs_dir}/som/{project}_deep_data_full.hdf5" + - FileTemplate: + name: deep_file_small + path_template: "{catalogs_dir}/som/{project}_deep_data_small.hdf5" + - FileTemplate: + name: wide_file_full + path_template: "{catalogs_dir}/som/{project}_wide_data_full.hdf5" + - FileTemplate: + name: wide_file_small + path_template: "{catalogs_dir}/som/{project}_wide_data_small.hdf5" + - FileTemplate: + name: spec_file_full + path_template: "{catalogs_dir}/som/{project}_spec_data_full.hdf5" + - FileTemplate: + name: spec_file_small + path_template: "{catalogs_dir}/som/{project}_spec_data_small.hdf5" + + +# The next plots list all available algorithms of various types and assigns short names each of then +# + +# These describe all the algorithms that emulate spectroscopic selections +SpecSelections: + - SpecSelection: + name: zCOSMOS + Select: SpecSelection_zCOSMOS + Module: rail.creation.degraders.spectroscopic_selections + + +# These describe all the algorithms that estimate PZ +PZAlgorithms: + - PZAlgorithm: + name: trainz + Estimate: TrainZEstimator + Inform: TrainZInformer + Module: rail.estimation.algos.train_z + - PZAlgorithm: + name: simplenn + Estimate: SklNeurNetEstimator + Inform: SklNeurNetInformer + Module: rail.estimation.algos.sklearn_neurnet + - PZAlgorithm: + name: fzboost + Estimate: FlexZBoostEstimator + Inform: FlexZBoostInformer + Module: rail.estimation.algos.flexzboost + - PZAlgorithm: + name: knn + Estimate: KNearNeighEstimator + Inform: KNearNeighInformer + Module: rail.estimation.algos.k_nearneigh + - PZAlgorithm: + name: gpz + Estimate: GPzEstimator + Inform: GPzInformer + Module: rail.estimation.algos.gpz + + +# These describe all the algorithms that classify objects into tomographic 
bins +Classifiers: + - Classifier: + name: equal_count + Classify: EqualCountClassifier + Module: rail.estimation.algos.equal_count + - Classifier: + name: uniform_binning + Classify: UniformBinningClassifier + Module: rail.estimation.algos.uniform_binning + + +# These describe the error models we use in the truth_to_observed pipeline +ErrorModels: + - ErrorModel: + name: lsst + ErrorModel: LSSTErrorModel + Module: rail.creation.degraders.photometric_errors + - ErrorModel: + name: roman + ErrorModel: RomanErrorModel + Module: rail.creation.degraders.photometric_errors + + +# These describe the ways we can sub-sample the data +Subsamplers: + - Subsampler: + name: random_subsampler + Subsample: RandomSubsampler + Module: rail.projects.subsampler + - Subsampler: + name: multi_catalog_subsampler + Subsample: MultiCatalogSubsample + Module: rail.projects.subsampler + + +# These describe the ways we can reduce the data +Reducers: + - Reducer: + name: cardinal + Reduce: CardinalReducer + Module: rail.projects.reducer + + +# These describe the various data analysis pipelines +Pipelines: + - PipelineTemplate: + name: truth_to_observed + pipeline_class: rail.pipelines.degradation.truth_to_observed.TruthToObservedPipeline + input_catalog_template: reduced + output_catalog_template: degraded + kwargs: + error_models: ['all'] + selectors: ['all'] + blending: true + - PipelineTemplate: + name: prepare + pipeline_class: rail.pipelines.utils.prepare_observed.PrepareObservedPipeline + input_catalog_template: reduced + output_catalog_template: degraded + - PipelineTemplate: + name: photometric_errors + pipeline_class: rail.pipelines.degradation.apply_phot_errors.ApplyPhotErrorsPipeline + input_catalog_template: reduced + output_catalog_template: degraded + kwargs: + error_models: ['all'] + - PipelineTemplate: + name: spec_selection + input_catalog_template: degraded + output_catalog_template: degraded + input_catalog_basename: output_dereddener_errors.pq + pipeline_class: 
rail.pipelines.degradation.spectroscopic_selection_pipeline.SpectroscopicSelectionPipeline + kwargs: + selectors: ['all'] + - PipelineTemplate: + name: inform + pipeline_class: rail.pipelines.estimation.inform_all.InformPipeline + input_catalog_template: degraded + output_catalog_template: degraded + input_file_templates: + input: + flavor: baseline + tag: train + kwargs: + algorithms: ['all'] + - PipelineTemplate: + name: estimate + pipeline_class: rail.pipelines.estimation.estimate_all.EstimatePipeline + input_catalog_template: degraded + output_catalog_template: degraded + input_file_templates: + input: + flavor: baseline + tag: test + kwargs: + algorithms: ['all'] + - PipelineTemplate: + name: evaluate + pipeline_class: rail.pipelines.evaluation.evaluate_all.EvaluationPipeline + input_catalog_template: degraded + output_catalog_template: degraded + input_file_templates: + truth: + flavor: baseline + tag: test + kwargs: + algorithms: ['all'] + - PipelineTemplate: + name: pz + pipeline_class: rail.pipelines.estimation.pz_all.PzPipeline + input_catalog_template: degraded + output_catalog_template: degraded + input_file_templates: + input_train: + flavor: baseline + tag: train + input_test: + flavor: baseline + tag: test + kwargs: + algorithms: ['all'] +# These describe the selections going from "truth" to "reduced" catalog + +# These just all use different limiting i-band magnitudes +Selections: + - Selection: + name: maglim_25.5 + cuts: + maglim_i: [null, 25.5] + - Selection: + name: gold + cuts: + maglim_i: [null, 25.5] + - Selection: + name: blend + cuts: + maglim_i: [null, 26.0] + - Selection: + name: crap + cuts: + maglim_i: [null, 30.0] + - Selection: + name: all + cuts: + maglim_i: [null, null] + +# These describe the subsampling used to make test and training datasets +Subsamples: + # Testing subsample with 100 events + - Subsample: + name: test_100k + seed: 1234 + num_objects: 100000 + # Training subsample with 100k events, and a different seed to select 
the events + - Subsample: + name: train_100k + seed: 4321 + num_objects: 100000 + # Training subsample with 100k events, and a different seed to select the events + - Subsample: + name: train_200k + seed: 5555 + num_objects: 200000 + +# These describe the ways we can split the data +Splitters: + - Splitter: + name: random_splitter + Split: RandomSplitter + Module: rail.projects.splitter diff --git a/examples/rail_project_example.ipynb b/examples/rail_project_example.ipynb index e644d4b..c5ae8cb 100644 --- a/examples/rail_project_example.ipynb +++ b/examples/rail_project_example.ipynb @@ -215,9 +215,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "testenv", "language": "python", - "name": "python3" + "name": "testenv" }, "language_info": { "codemirror_mode": { @@ -229,7 +229,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.12" } }, "nbformat": 4, diff --git a/examples/rail_project_example_Cardinal.ipynb b/examples/rail_project_example_Cardinal.ipynb new file mode 100644 index 0000000..74ad574 --- /dev/null +++ b/examples/rail_project_example_Cardinal.ipynb @@ -0,0 +1,289 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3f31a160-8ce9-4fe6-8805-9df7a97b7adc", + "metadata": {}, + "source": [ + "## Using `RailProject` \n", + "\n", + "This notebook will show you the basics using the `RailProject` class to manage an analysis project" + ] + }, + { + "cell_type": "markdown", + "id": "d436cc4a-49c9-4f27-93cf-9a196d251a77", + "metadata": {}, + "source": [ + "### Setup and teardown scripts to setup a test area" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96bee599-4711-4a07-b182-1cbe257ea4d8", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "#from rail.projects import library\n", + "#\n", + "check_dir = os.path.basename(os.path.abspath(os.curdir))\n", + "if check_dir == 'examples':\n", + " 
os.chdir('..')\n", + "\n", + "#setup = library.setup_project_area()\n", + "#assert setup == 0\n", + "\n", + "# use this to cleanup\n", + "# library.teardown_project_area()" + ] + }, + { + "cell_type": "markdown", + "id": "454c1464-aa61-4a0c-96b9-f48f5ddb94f6", + "metadata": {}, + "source": [ + "### Load the test project" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7136e23a-43f1-4a36-ba77-03b9a649c449", + "metadata": {}, + "outputs": [], + "source": [ + "from rail.projects import RailProject\n", + "\n", + "project = RailProject.load_config(\"./examples/cardinal_project.yaml\")" + ] + }, + { + "cell_type": "markdown", + "id": "57c75f4c-23f4-415b-8148-5b39754b5770", + "metadata": {}, + "source": [ + "### Inspect the test project" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f47f06c0-2718-4cdd-aaa9-156341336907", + "metadata": {}, + "outputs": [], + "source": [ + "catalog_files_truth = project.get_catalog_files(\"truth\")\n", + "print(catalog_files_truth)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9629079a-b8f1-418b-b7ba-46f3d0c8830a", + "metadata": {}, + "outputs": [], + "source": [ + "list(catalog_files_truth)" + ] + }, + { + "cell_type": "markdown", + "id": "d37d65cf-6fbe-4fcd-818f-6eaa6a0aa88d", + "metadata": {}, + "source": [ + "### Run a data reduction algorithm on the test project data\n", + "\n", + "This will use the \"cardinal\" reducer to apply the \"gold\" selection to the \"truth\" catalog to make a \"reduced\" catalog" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e4a8135-8131-473b-a132-b9be2109dbe0", + "metadata": {}, + "outputs": [], + "source": [ + "project.reduce_data(\n", + " catalog_template=\"truth\",\n", + " output_catalog_template=\"reduced\",\n", + " reducer_class_name=\"cardinal\",\n", + " input_selection=\"\",\n", + " selection=\"gold\",\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id":
"a70ff5d0-2e93-47e9-b5d4-7d791e883d9f", + "metadata": {}, + "source": [ + "### Subsample the test project\n", + "\n", + "This will use the \"random_subsampler\" to apply the \"train_200k\" subsample to the \"reduced\" catalog of the baseline flavor with the gold selection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6b79133-b695-433c-a47f-91bfa9fffc97", + "metadata": {}, + "outputs": [], + "source": [ + "project.subsample_data(\n", + " catalog_template=\"reduced\",\n", + " file_template=\"train_file_200k\",\n", + " subsampler_class_name=\"random_subsampler\",\n", + " subsample_name=\"train_200k\",\n", + " flavor=\"baseline\",\n", + " selection=\"gold\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a07e4fbd-a75b-48f7-8c45-d249e39d2bf5", + "metadata": {}, + "outputs": [], + "source": [ + "import tables_io" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95396640-4d02-41e0-a396-b118fc84035f", + "metadata": {}, + "outputs": [], + "source": [ + "goldfile = \"/global/cfs/cdirs/lsst/groups/PZ/Cardinal/parquet_files/test/test_cardinal_gold_baseline_200k.parquet\"\n", + "goldtab = tables_io.read(goldfile)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b83bdc4e-3652-4adc-a110-094b21aa9ef3", + "metadata": {}, + "outputs": [], + "source": [ + "golddf = tables_io.convert(goldtab, tables_io.types.PD_DATAFRAME)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8680e7c9-e739-4793-9e32-a9c7b522f3c0", + "metadata": {}, + "outputs": [], + "source": [ + "golddf.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61cb57b0-592a-41a2-9e0a-1bf95e6ae08f", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55acb027-356c-4542-9242-f2bd6d689102", + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(8,8))\n",
+ "plt.scatter(golddf['ra'], golddf['dec'], s=1, c='k')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "261d2f64-586a-42ee-a6fb-74870c91b4d3", + "metadata": {}, + "outputs": [], + "source": [ + "gr = golddf['mag_g_lsst'] - golddf['mag_r_lsst']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e505542-9168-4e12-bbf2-ff304e11528a", + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(8,5))\n", + "plt.scatter(golddf['redshift'], gr, s=1,c='k')\n", + "plt.xlabel(\"redshift\")\n", + "plt.ylabel(\"g-r\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0dc73824-0698-47c5-8928-c282ba474f7c", + "metadata": {}, + "outputs": [], + "source": [ + "# difference of cosmological redshift (w proper motion) - true z vs true redshift\n", + "delz = golddf['redshift'] - golddf['true_redshift']\n", + "plt.figure(figsize=(14,8))\n", + "plt.scatter(golddf['redshift'], delz, s=.1, c='k')\n", + "plt.xlabel(\"true redshift\")\n", + "plt.ylabel(\"z cosmological - z true\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25199fe0-e845-4f48-8ff5-ae28af2ca67a", + "metadata": {}, + "outputs": [], + "source": [ + "# plot the redshift distribution of the \"gold\" i<25.5 sample:\n", + "import numpy as np\n", + "plt.figure(figsize=(10,6))\n", + "plt.hist(golddf['redshift'], bins=np.linspace(0,2.5,101), color='k');\n", + "plt.xlabel(\"redshift\")\n", + "plt.ylabel(\"Number\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e26824a9-0dae-4192-a790-449e5c0eb3f7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "testenv", + "language": "python", + "name": "testenv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + 
"version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/rail/projects/library.py b/src/rail/projects/library.py index 4ab108d..140f7e3 100644 --- a/src/rail/projects/library.py +++ b/src/rail/projects/library.py @@ -236,6 +236,7 @@ def setup_project_area() -> int: # pragma: no cover if not os.path.exists("tests/ci_test.tgz"): urllib.request.urlretrieve( + # "https://portal.nersc.gov/cfs/lsst/PZ/test_data/ci_test.tgz", "http://s3df.slac.stanford.edu/people/echarles/xfer/ci_test.tgz", "tests/ci_test.tgz", ) diff --git a/src/rail/projects/reducer.py b/src/rail/projects/reducer.py index fb2cd19..70fb8a5 100644 --- a/src/rail/projects/reducer.py +++ b/src/rail/projects/reducer.py @@ -65,27 +65,57 @@ COLUMNS_FLAGSHIP = [ "galaxy_id", - "ra_mag_gal", # observed galaxy ra/dec with lensing displacement field applied [degrees] + "ra_mag_gal", # observed galaxy ra/dec with lensing displacement field applied [degrees] "dec_mag_gal", - "lsst_u_el_model3_ext", # observed flux from the continuum + emission including internal attenuation in LSST bands + "lsst_u_el_model3_ext", # observed flux from the continuum + emission including internal attenuation in LSST bands "lsst_g_el_model3_ext", "lsst_r_el_model3_ext", "lsst_i_el_model3_ext", "lsst_z_el_model3_ext", "lsst_y_el_model3_ext", - "euclid_nisp_h_el_model3_ext", # euclid bands (noiseless) + "euclid_nisp_h_el_model3_ext", # euclid bands (noiseless) "euclid_nisp_j_el_model3_ext", "euclid_nisp_y_el_model3_ext", "euclid_vis_el_model3_ext", - "bulge_r50", # half light radius of the bulge [arcsec] - "disk_r50", # half light radius of the disk for an exponential profile (or Sersic profile with index n=1); disk_r50 = disk_scalelength * 1.678 [arcsec] - "bulge_fraction", # ratio of the flux in the bulge component to the total flux (often written B/T) - "gamma1", # shape contribution from lensing, not large but added for consistency + "bulge_r50", # half light radius of the bulge [arcsec] + "disk_r50", # 
# Columns read from the Cardinal parquet catalogs.  CardinalReducer scans
# exactly this list and passes every entry through to its output table.
COLUMNS_CARDINAL = [
    "galaxy_id", "ra", "dec", "sedid", "size",
    "Ellipticity_1", "Ellipticity_2",
    "mag_u_lsst", "mag_g_lsst", "mag_r_lsst",
    "mag_i_lsst", "mag_z_lsst", "mag_y_lsst",
    "Roman_Y106", "Roman_J129", "Roman_H158", "Roman_F184", "Roman_K213",
    "WISE_W1", "WISE_W2",
    "redshift", "true_redshift", "t_true_redshift",
    "Euclid_Y", "Euclid_J", "Euclid_H", "Euclid_redshift",
]

# Extra output columns appended on top of COLUMNS_CARDINAL: one dict per
# projection stage, mapping output column name -> source expression.
# Mappings that are currently disabled, kept for reference:
#   "Roman_K213": pc.field("k213")
#   "galaxy_id": pc.field("id")
PROJECTIONS_CARDINAL = [
    dict(
        Ellipticity1=pc.field("Ellipticity_1"),
        Ellipticity2=pc.field("Ellipticity_2"),
    ),
]
class CardinalReducer(RailReducer):
    """Reduce the 'Cardinal' simulation input files for pz analysis.

    Note that the native Cardinal files are FITS files split into triplets; a
    preprocessing stage was performed to put them into pyarrow parquet, which
    is the format this reducer reads.

    The reducer scans ``COLUMNS_CARDINAL`` from the input parquet catalog,
    applies the selection configured in ``cuts``, appends the aliased columns
    listed in ``PROJECTIONS_CARDINAL`` and writes the result as parquet.
    """

    config_options: dict[str, StageParameter] = dict(
        name=StageParameter(str, None, fmt="%s", required=True, msg="Reducer Name"),
        cuts=StageParameter(dict, {}, fmt="%s", msg="Selections"),
    )

    def _cardinal_predicate(self) -> "pc.Expression | None":
        """Translate ``self.config.cuts`` into a pyarrow filter expression.

        Returns
        -------
        The filter expression, or ``None`` when no row selection applies
        (no cuts configured, or a legacy cut whose upper limit is ``None``).

        Raises
        ------
        ValueError
            If the cuts cannot be parsed and contain no known legacy key.
        """
        cuts = self.config.cuts
        # Try to do this right
        try:
            return pq.filters_to_expression(parse_item(cuts))
        except Exception as msg:
            # Fallback to old way. FIXME, deprecate this
            if not cuts:  # pragma: no cover
                return None
            # Legacy cuts are [lower, upper] pairs; only the upper limit is used.
            legacy_columns = (
                ("maglim_i", "mag_i_lsst"),
                ("maglim_Y", "Roman_Y106"),
            )
            for cut_name, column in legacy_columns:
                if cut_name in cuts:
                    upper = cuts[cut_name][1]
                    if upper is None:
                        # An open upper limit (e.g. ``[null, null]``) means
                        # "keep everything".  Comparing against None would
                        # give an all-null mask and drop every row.
                        return None
                    return pc.field(column) < upper
            raise ValueError("No valid cut") from msg

    def run(
        self,
        input_catalog: str,
        output_catalog: str,
    ) -> None:
        """Reduce one catalog file.

        Parameters
        ----------
        input_catalog
            Path of the parquet catalog to read.
        output_catalog
            Path of the parquet file to write; its parent directory is
            created when it does not exist.
        """
        predicate = self._cardinal_predicate()

        dataset = ds.dataset(
            input_catalog,
            format="parquet",
        )

        seq = [
            acero.Declaration(
                "scan",
                acero.ScanNodeOptions(
                    dataset,
                    columns=COLUMNS_CARDINAL,
                    filter=predicate,
                ),
            )
        ]

        # The scan-level filter is not sufficient on its own, so the selection
        # is repeated in an explicit filter node.  That node requires a real
        # expression, so it is left out when there is nothing to select on.
        if predicate is not None:
            seq.append(
                acero.Declaration(
                    "filter",
                    acero.FilterNodeOptions(
                        predicate,
                    ),
                )
            )

        # Start from a pass-through of every scanned column; each projection
        # stage then adds (or overrides) columns on top of the previous ones.
        projection = {name: pc.field(name) for name in COLUMNS_CARDINAL}
        for extra_columns in PROJECTIONS_CARDINAL:
            projection.update(extra_columns)
            seq.append(
                acero.Declaration(
                    "project",
                    acero.ProjectNodeOptions(
                        list(projection.values()),
                        names=list(projection.keys()),
                    ),
                )
            )

        plan = acero.Declaration.from_sequence(seq)

        table = plan.to_table(use_threads=True)
        print(f"writing dataset to {output_catalog}")

        output_dir = os.path.dirname(output_catalog)
        # dirname is "" for a bare filename, and os.makedirs("") raises.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        pq.write_table(table, output_catalog)
test_100 + train: train_100 + + # These define the variant configurations for the various parts of the analysis + Flavors: + - Flavor: + name: train_cosmos + pipelines: ['pz', 'tomography'] + file_aliases: # Set the training and test files + test: test_100 + train: train_100 + - Flavor: + name: gpz_gl + pipelines: ['pz'] # only run the pz pipeline + pipeline_overrides: # Override specifics for particular pipelines + default: + kwargs: + algorithms: ['gpz'] # Only run gpz + inform: + inform_gpz: + gpz_method: GL + + # These are variables that we iterate over when running over entire catalogs + IterationVars: + healpix: + - 122 + - 58 diff --git a/tests/ci_cardinal_project_library.yaml b/tests/ci_cardinal_project_library.yaml new file mode 100644 index 0000000..dd70c89 --- /dev/null +++ b/tests/ci_cardinal_project_library.yaml @@ -0,0 +1,297 @@ +# This is the "library" of common defintions shared between similar projects + +# Templates we use to define catalogs +# +# A CatalogTemplate can be given several interpolants to resolve to a specific Catalog +# +# Some of the interpolants [{catalogs_dir}, {project}, {sim_version}] +# are defined in the Project CommonPaths block +# +# Possibles values of {selection} are defined the the library "Selections" block +# +# Possible values of {flavor} are defined the the Project "Flavors" block +Catalogs: + # The input "Truth" catalog + - CatalogTemplate: + name: truth + path_template: "{catalogs_dir}/{healpix}/cardinal_test-0.parquet" + iteration_vars: ['healpix'] + # Catalogs that have been reduced using a particular selection + - CatalogTemplate: + name: reduced + path_template: "{catalogs_dir}/{project}_{selection}/{healpix}/cardinal_test_file-0.pq" + iteration_vars: ['healpix'] + # Data output from running some degraders on the original catalog + # Note that the {basename} can be used to select which degraded output we want + - CatalogTemplate: + name: degraded + path_template: 
"{catalogs_dir}/{project}_{selection}_{flavor}/{healpix}/{basename}" + iteration_vars: ['healpix'] + + +# This file describes the templates we use to define files and specfic instances +# of files +# +# A FileTemplate can be given several interpolants to resolve to a specific FileInstance +# +# Some of the interpolants [{catalogs_dir}, {project}] +# are defined in the Project CommonPaths block +# +# Possibles values of {selection} are defined the the library "Selections" block +Files: + # Testing data file + - FileTemplate: + name: test_100 + path_template: "{catalogs_dir}/test/{project}_{selection}_baseline_100k.hdf5" + # Generic training data file + - FileTemplate: + name: train_100 + path_template: "{catalogs_dir}/test/{project}_{selection}_baseline_100k.hdf5" + - FileTemplate: + name: train_file_02k + path_template: "{catalogs_dir}/test/{project}_{selection}_baseline_200k.hdf5" +# Training file with a non-representative sample from the zCOSMOS selection + - FileTemplate: + name: train_file_zCOSMOS_01k + path_template: "{catalogs_dir}/train/{project}_{selection}_zCOSMOS_100k.hdf5" + # Small trainng file + - FileTemplate: + name: train_file_10 + path_template: "{catalogs_dir}/test/{project}_{selection}_baseline_10.hdf5" + - FileTemplate: + name: test_split_file + path_template: "{catalogs_dir}/test/{project}_{selection}_baseline_split_test_10.hdf5" + - FileTemplate: + name: train_split_file + path_template: "{catalogs_dir}/train/{project}_{selection}_baseline_split_train_10.hdf5" + - FileTemplate: + name: deep_file_full + path_template: "{catalogs_dir}/som/{project}_deep_data_full.hdf5" + - FileTemplate: + name: deep_file_small + path_template: "{catalogs_dir}/som/{project}_deep_data_small.hdf5" + - FileTemplate: + name: wide_file_full + path_template: "{catalogs_dir}/som/{project}_wide_data_full.hdf5" + - FileTemplate: + name: wide_file_small + path_template: "{catalogs_dir}/som/{project}_wide_data_small.hdf5" + - FileTemplate: + name: spec_file_full + 
path_template: "{catalogs_dir}/som/{project}_spec_data_full.hdf5" + - FileTemplate: + name: spec_file_small + path_template: "{catalogs_dir}/som/{project}_spec_data_small.hdf5" + + +# The next plots list all available algorithms of various types and assigns short names each of then +# + +# These describe all the algorithms that emulate spectroscopic selections +SpecSelections: + - SpecSelection: + name: zCOSMOS + Select: SpecSelection_zCOSMOS + Module: rail.creation.degraders.spectroscopic_selections + + +# These describe all the algorithms that estimate PZ +PZAlgorithms: + - PZAlgorithm: + name: trainz + Estimate: TrainZEstimator + Inform: TrainZInformer + Module: rail.estimation.algos.train_z + - PZAlgorithm: + name: simplenn + Estimate: SklNeurNetEstimator + Inform: SklNeurNetInformer + Module: rail.estimation.algos.sklearn_neurnet + - PZAlgorithm: + name: fzboost + Estimate: FlexZBoostEstimator + Inform: FlexZBoostInformer + Module: rail.estimation.algos.flexzboost + - PZAlgorithm: + name: knn + Estimate: KNearNeighEstimator + Inform: KNearNeighInformer + Module: rail.estimation.algos.k_nearneigh + - PZAlgorithm: + name: gpz + Estimate: GPzEstimator + Inform: GPzInformer + Module: rail.estimation.algos.gpz + + +# These describe all the algorithms that classify objects into tomographic bins +Classifiers: + - Classifier: + name: equal_count + Classify: EqualCountClassifier + Module: rail.estimation.algos.equal_count + - Classifier: + name: uniform_binning + Classify: UniformBinningClassifier + Module: rail.estimation.algos.uniform_binning + + +# These describe the error models we use in the truth_to_observed pipeline +ErrorModels: + - ErrorModel: + name: lsst + ErrorModel: LSSTErrorModel + Module: rail.creation.degraders.photometric_errors + - ErrorModel: + name: roman + ErrorModel: RomanErrorModel + Module: rail.creation.degraders.photometric_errors + + +# These describe the ways we can sub-sample the data +Subsamplers: + - Subsampler: + name: random_subsampler 
+ Subsample: RandomSubsampler + Module: rail.projects.subsampler + - Subsampler: + name: multi_catalog_subsampler + Subsample: MultiCatalogSubsample + Module: rail.projects.subsampler + + +# These describe the ways we can reduce the data +Reducers: + - Reducer: + name: cardinal + Reduce: CardinalReducer + Module: rail.projects.reducer + + +# These describe the various data analysis pipelines +Pipelines: + - PipelineTemplate: + name: truth_to_observed + pipeline_class: rail.pipelines.degradation.truth_to_observed.TruthToObservedPipeline + input_catalog_template: reduced + output_catalog_template: degraded + kwargs: + error_models: ['all'] + selectors: ['all'] + blending: true + - PipelineTemplate: + name: prepare + pipeline_class: rail.pipelines.utils.prepare_observed.PrepareObservedPipeline + input_catalog_template: reduced + output_catalog_template: degraded + - PipelineTemplate: + name: photometric_errors + pipeline_class: rail.pipelines.degradation.apply_phot_errors.ApplyPhotErrorsPipeline + input_catalog_template: reduced + output_catalog_template: degraded + kwargs: + error_models: ['all'] + - PipelineTemplate: + name: spec_selection + input_catalog_template: degraded + output_catalog_template: degraded + input_catalog_basename: output_dereddener_errors.pq + pipeline_class: rail.pipelines.degradation.spectroscopic_selection_pipeline.SpectroscopicSelectionPipeline + kwargs: + selectors: ['all'] + - PipelineTemplate: + name: inform + pipeline_class: rail.pipelines.estimation.inform_all.InformPipeline + input_catalog_template: degraded + output_catalog_template: degraded + input_file_templates: + input: + flavor: baseline + tag: train + kwargs: + algorithms: ['all'] + - PipelineTemplate: + name: estimate + pipeline_class: rail.pipelines.estimation.estimate_all.EstimatePipeline + input_catalog_template: degraded + output_catalog_template: degraded + input_file_templates: + input: + flavor: baseline + tag: test + kwargs: + algorithms: ['all'] + - PipelineTemplate: 
+ name: evaluate + pipeline_class: rail.pipelines.evaluation.evaluate_all.EvaluationPipeline + input_catalog_template: degraded + output_catalog_template: degraded + input_file_templates: + truth: + flavor: baseline + tag: test + kwargs: + algorithms: ['all'] + - PipelineTemplate: + name: pz + pipeline_class: rail.pipelines.estimation.pz_all.PzPipeline + input_catalog_template: degraded + output_catalog_template: degraded + input_file_templates: + input_train: + flavor: baseline + tag: train + input_test: + flavor: baseline + tag: test + kwargs: + algorithms: ['all'] +# These describe the selections going from "truth" to "reduced" catalog + +# These differ only in their limiting i-band magnitude (maglim_25.5 and gold share the same cut) +Selections: + - Selection: + name: maglim_25.5 + cuts: + maglim_i: [null, 25.5] + - Selection: + name: gold + cuts: + maglim_i: [null, 25.5] + - Selection: + name: blend + cuts: + maglim_i: [null, 26.0] + - Selection: + name: crap + cuts: + maglim_i: [null, 30.0] + - Selection: + name: all + cuts: + maglim_i: [null, null] + +# These describe the subsampling used to make test and training datasets +Subsamples: + # Testing subsample with 100 events + - Subsample: + name: test_100 + seed: 1234 + num_objects: 100 + # Training subsample with 100 events, and a different seed to select the events + - Subsample: + name: train_100 + seed: 4321 + num_objects: 100 + # Training subsample with 101 events, and a different seed to select the events + - Subsample: + name: train_101 + seed: 5555 + num_objects: 101 + +# These describe the ways we can split the data +Splitters: + - Splitter: + name: random_splitter + Split: RandomSplitter + Module: rail.projects.splitter diff --git a/tests/ci_project-cardinal.yaml b/tests/ci_project-cardinal.yaml new file mode 100644 index 0000000..8626729 --- /dev/null +++ b/tests/ci_project-cardinal.yaml @@ -0,0 +1,59 @@ +Project: + + Name: ci_test_cardinal + + # Include other configuration files + Includes: + - tests/ci_cardinal_project_library.yaml # NOTE(review): this diff only adds the cardinal Reducer to tests/ci_project_library.yaml; confirm this include path exists
+ + PathTemplates: {} + + CommonPaths: + root: /global/cfs/cdirs/lsst/groups/PZ/Cardinal + scratch_root: "{root}" + catalogs_dir: "{root}/parquet_files" + project: ci_test_cardinal + sim_version: v1 + + # Baseline configuration, included in others by default + Baseline: + catalog_tag: cardinal + pipelines: ['all'] + file_aliases: # Set the training and test files + test: test_file_100k + train: train_file_100k + train_zCOSMOS: train_file_zCOSMOS_100k + wide: wide_file_full + deep: deep_file_full + spec: spec_file_full + + # These define the variant configurations for the various parts of the analysis + Flavors: + - Flavor: + name: train_cosmo # NOTE(review): examples/cardinal_project.yaml names this flavor train_cosmos; confirm which is intended + pipelines: ['pz', 'tomography'] + file_aliases: # Set the training and test files + test: test_file_100k + train: train_file_zCOSMOS_100k + - Flavor: + name: gpz_gl + pipelines: ['pz'] # only run the pz pipeline + pipeline_overrides: # Override specifics for particular pipelines + default: + kwargs: + algorithms: ['gpz'] # Only run gpz + inform: + inform_gpz: + gpz_method: GL + + # These are variables that we iterate over when running over entire catalogs + IterationVars: + healpix: + - 430 + - 73 + - 122 + - 99 + - 16 + - 58 + - 277 + - 346 diff --git a/tests/ci_project_library.yaml b/tests/ci_project_library.yaml index af832a2..fc430c1 100644 --- a/tests/ci_project_library.yaml +++ b/tests/ci_project_library.yaml @@ -197,6 +197,11 @@ Reducers: Reduce: FlagshipReducer Module: rail.projects.reducer + - Reducer: + name: cardinal + Reduce: CardinalReducer + Module: rail.projects.reducer + # These describe the various data analysis pipelines Pipelines: diff --git a/tests/projects/test_cardinal_project.py b/tests/projects/test_cardinal_project.py new file mode 100644 index 0000000..15e229a --- /dev/null +++ b/tests/projects/test_cardinal_project.py @@ -0,0 +1,190 @@ +import os +from typing import Any, Callable + +import pytest + +from rail.projects.project import RailProject + + +def check_get_func(func: Callable, check_dict: 
dict[str, Any]) -> None: + for key, val in check_dict.items():  # each key of check_dict is looked up through func + check_val = func(key) + if isinstance(check_val, dict):  # only dict results are compared, entry by entry, against check_dict + for kk, vv in check_val.items(): + assert vv == val[kk] + with pytest.raises(KeyError):  # an unknown name is expected to raise KeyError + func("does_not_exist") + + +def test_project_doc() -> None:  # calls both help methods; nothing is asserted + RailProject.functionality_help() + RailProject.configuration_help() + + +def test_project_class(setup_project_area: int) -> None:  # walks through the RailProject accessors and data-processing calls + assert setup_project_area == 0  # setup_project_area is presumably a fixture defined elsewhere; confirm + + project = RailProject.load_config("tests/ci_cardinal_project.yaml")  # NOTE(review): not the tests/ci_project-cardinal.yaml added alongside this test; confirm this file exists + + print(project) + + templates = project.get_path_templates() + check_get_func(project.get_path, templates) + + common_paths = project.get_common_paths() + check_get_func(project.get_common_path, common_paths) + + files = project.get_files() + check_get_func(project.get_file, files) + + flavors = project.get_flavors() + check_get_func(project.get_flavor, flavors) + all_flavors = project.get_flavor_args(["all"])  # "all" must expand to exactly the known flavor names + assert set(all_flavors) == set(flavors.keys()) + assert project.get_flavor_args(["dummy"])[0] == "dummy"  # any other name is passed through unchanged + + project.get_file_for_flavor("baseline", "test") + with pytest.raises(KeyError): + project.get_file_for_flavor("baseline", "does not exist") + + project.get_file_metadata_for_flavor("baseline", "test") + with pytest.raises(KeyError): + project.get_file_metadata_for_flavor("baseline", "does not exist") + + selections = project.get_selections() + check_get_func(project.get_selection, selections) + project.clear_cache()  # repeat the same selection lookups after clearing the cache + check_get_func(project.get_selection, selections) + all_selections = project.get_selection_args(["all"]) + assert set(all_selections) == set(selections.keys()) + assert project.get_selection_args(["dummy"])[0] == "dummy" + + subsamples = project.get_subsamples() + check_get_func(project.get_subsample, subsamples) + + itr = project.generate_kwargs_iterable( + selections=all_selections, + flavors=all_flavors, + ) + for x_ in itr:  # every generated item must be a dict + assert isinstance(x_, dict) + + error_models = project.get_error_models() +
check_get_func(project.get_error_model, error_models) + + pz_algos = project.get_pzalgorithms() + check_get_func(project.get_pzalgorithm, pz_algos) + + spec_selections = project.get_spec_selections() + check_get_func(project.get_spec_selection, spec_selections) + + classifiers = project.get_classifiers() + check_get_func(project.get_classifier, classifiers) + + summarizers = project.get_summarizers() + check_get_func(project.get_summarizer, summarizers) + + catalogs = project.get_catalogs() + check_get_func(project.get_catalog, catalogs) + + pipelines = project.get_pipelines() + check_get_func(project.get_pipeline, pipelines) + + _ceci_command = project.generate_ceci_command(  # result is unused; the call only has to succeed + pipeline_path="dummy.yaml", + config=None, + inputs={"bob": "bob.pkl"}, + output_dir=".", + log_dir=".", + alice="bob", + ) + + project.build_pipelines(flavor="baseline") + + catalog_files_truth = project.get_catalog_files("truth") + check_path = "tests/temp_data/data/ci_test_v1.1.3/58/cardinal_test-0.parquet" + assert check_path in catalog_files_truth + + catalog_files_reduced = project.get_catalog_files("reduced", selection="gold") + check_path = "tests/temp_data/data/ci_test_v1.1.3/ci_test_gold/58/cardinal_test_file-0.pq" + assert check_path in catalog_files_reduced + + catalog_files_degraded = project.get_catalog_files( + "degraded", selection="gold", flavor="baseline", basename="output.hdf5" + ) + check_path = "tests/temp_data/data/ci_test_v1.1.3/ci_test_gold_baseline/58/output.hdf5" + assert check_path in catalog_files_degraded + + project.reduce_data( + catalog_template="truth", + output_catalog_template="reduced", + reducer_class_name="cardinal", + input_selection="", + selection="gold", + ) + + project.subsample_data( + catalog_template="reduced", + file_template="train_100", + subsampler_class_name="random_subsampler", + subsample_name="train_100", + flavor="baseline", + selection="gold", + ) + + project.split_data( + file_template="train_100", +
test_file_template="test_split_file", + train_file_template="train_split_file", + splitter_class_name="random_splitter", + flavor="baseline", + selection="gold", + ) + + single_ceci_command = project.make_pipeline_single_input_command( + pipeline_name="pz", + flavor="baseline", + selection="gold", + ) + assert single_ceci_command + + ceci_catalog_commands = project.make_pipeline_catalog_commands( + pipeline_name="spec_selection", + flavor="baseline", + selection="gold", + spec_selections=list(project.get_spec_selections().keys()), + ) + assert ceci_catalog_commands + + project.write_yaml("tests/temp.yaml")  # round trip: write the config out, then reload it below + + RailProject.projects.clear()  # empty the class-level projects container before reloading + + project = RailProject.load_config("tests/temp.yaml") + os.unlink("tests/temp.yaml")  # remove the temporary file once it has been reloaded + project.get_file_for_flavor("baseline", "test") + + project.add_flavor( + name="test_flavor", + catalog_tag="cardinal", + pipelines=["pz"], + ) + with pytest.raises(KeyError):  # adding the same flavor name a second time must raise + project.add_flavor( + name="test_flavor", + catalog_tag="cardinal", + pipelines=["pz"], + ) + + flavor_info = project.get_flavor("test_flavor") + assert flavor_info + + model_path = ( + "tests/temp_data/projects/ci_test/data/blend_baseline/model_inform_knn.pkl" + ) + project.wrap_pz_model( + model_path, + "tests/temp_data", + selection="blend", + flavor="baseline", + )