uniform_batch_correction/nextflow_schema.json at main · BioimageAnalysisCoreWEHI/uniform_batch_correction · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://raw.githubusercontent.com/BioimageAnalysisCoreWEHI/uniform_batch_correction/main/nextflow_schema.json",
  "title": "uniform_batch_correction pipeline parameters",
  "description": "Standalone UniFORM-style normalization for cellmeasurement GeoJSON outputs",
  "type": "object",
  "required": ["input", "outdir"],
  "properties": {
    "input": {
      "type": "string",
      "format": "file-path",
      "exists": true,
      "schema": "assets/schema_input.json",
      "mimetype": "text/csv",
      "pattern": "^\\S+\\.(csv|y(a)?ml)$",
      "description": "Path to input samplesheet (CSV or YAML). Must contain at least 'sample' plus one of 'geojson', 'ome_tiff', or 'adata' columns/fields."
    },
    "outdir": {
      "type": "string",
      "format": "directory-path",
      "description": "Output directory for normalized GeoJSONs, QC artifacts, and versions metadata. Results are published under subfolders such as uniformnormalize/ and uniformnormalize/qc/."
    },
    "run_uniform": {
      "type": "boolean",
      "default": true,
      "description": "Master switch for UniFORM-style normalization. If true, per-key cohort alignment is applied and normalized GeoJSON files are written. If false, inputs are passed through unchanged (useful for baseline comparisons)."
    },
    "uniform_apply_to": {
      "type": "string",
      "default": "geojson",
      "enum": ["geojson", "ome_tiff", "adata"],
      "description": "Select normalization target for this run: geojson (cellmeasurement numeric fields), ome_tiff (pixel-level per-channel scaling), or adata (feature-level AnnData matrix scaling)."
    },
    "uniform_num_bins": {
      "type": "integer",
      "default": 1024,
      "minimum": 32,
      "maximum": 8192,
      "description": "Number of histogram bins used when estimating per-key distribution shifts across samples. Higher values provide finer alignment but can increase noise sensitivity and runtime; lower values are smoother and faster but less precise."
    },
    "uniform_min_value": {
      "type": "number",
      "default": 1.0,
      "minimum": 0.0,
      "description": "Minimum measurement value retained before log transform and shift estimation. Values below this threshold are excluded from alignment calculations. Increase if low-intensity noise dominates."
    },
    "uniform_exclude_pattern": {
      "type": "string",
      "default": "^(kronos_|emb_)",
      "description": "Regular expression used to exclude measurement keys from normalization. Any key matching this pattern is copied unchanged to outputs (default protects KRONOS/embedding fields)."
    },
    "uniform_output_suffix": {
      "type": "string",
      "default": "_uniform",
      "description": "Suffix appended to normalized output filenames before .geojson (e.g. sample.geojson -> sample_uniform.geojson)."
    },
    "uniform_pixel_output_suffix": {
      "type": "string",
      "default": "_uniform",
      "description": "Suffix appended to normalized OME-TIFF/TIFF filenames (e.g. sample.ome.tiff -> sample_uniform.ome.tiff)."
    },
    "uniform_pixel_sample_size": {
      "type": "integer",
      "default": 200000,
      "minimum": 1000,
      "maximum": 5000000,
      "description": "Maximum sampled pixels per channel per image for histogram shift estimation in pixel mode. Higher values improve stability but increase runtime/memory."
    },
    "uniform_pixel_group_by": {
      "type": "string",
      "default": "image",
      "enum": ["image", "batch"],
      "description": "Grouping strategy for pixel mode. 'image' computes per-image scaling (default). 'batch' computes one scale per batch and applies it to all images in that batch."
    },
    "uniform_pixel_batch_map": {
      "type": "string",
      "default": "",
      "description": "Path to CSV/TSV table used when uniform_pixel_group_by=batch. Must include sample and batch columns (configure with uniform_pixel_batch_sample_column and uniform_pixel_batch_column)."
    },
    "uniform_pixel_batch_sample_column": {
      "type": "string",
      "default": "sample",
      "description": "Column name in uniform_pixel_batch_map containing sample IDs. IDs must match OME-TIFF sample IDs derived from filename stem."
    },
    "uniform_pixel_batch_column": {
      "type": "string",
      "default": "batch",
      "description": "Column name in uniform_pixel_batch_map containing batch labels for pixel batch normalization."
    },
    "uniform_adata_group_by": {
      "type": "string",
      "default": "image",
      "description": "obs column used to define per-sample groups in AnnData mode. Each unique value in this column is normalized as one sample (default: image)."
    },
    "uniform_adata_sample_size": {
      "type": "integer",
      "default": 200000,
      "minimum": 1000,
      "maximum": 5000000,
      "description": "Maximum sampled observations per AnnData group per feature used for histogram shift estimation in adata mode."
    },
    "uniform_adata_target": {
      "type": "string",
      "default": "all",
      "enum": ["all", "cell_mean"],
      "description": "Simple AnnData feature target preset. Use 'cell_mean' to normalize only features ending with _Cell_Mean, or 'all' for all features."
    },
    "uniform_adata_filter_column": {
      "type": "string",
      "default": "",
      "description": "Optional var column used for selecting which AnnData features to normalize (e.g. statistic, marker, feature_type). If empty, filtering is applied to inferred feature names."
    },
    "uniform_adata_filter_regex": {
      "type": "string",
      "default": "",
      "description": "Optional regex used to select AnnData features for normalization. Only matching features are scaled; non-matching features remain unchanged."
    },
    "uniform_generate_plots": {
      "type": "boolean",
      "default": true,
      "description": "If true, generate QC plots including before/after distribution overlays and scale-factor heatmaps. Disable for faster non-visual runs or minimal output mode."
    },
    "uniform_qc_top_n_keys": {
      "type": "integer",
      "default": 40,
      "minimum": 1,
      "maximum": 200,
      "description": "Number of top-ranked (most adjusted) measurement keys to render as before/after histogram overlays. Larger values improve coverage but produce more plot files and longer plotting time."
    },
    "uniform_qc_max_heatmap_keys": {
      "type": "integer",
      "default": 40,
      "minimum": 1,
      "maximum": 500,
      "description": "Maximum number of measurement keys displayed in the scale-factor heatmap (rows). Higher values provide broader overview across features; lower values improve readability and reduce figure density."
    }
  }
}