forked from Vindaar/TimepixAnalysis
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreateAllLikelihoodCombinations.nim
More file actions
374 lines (333 loc) · 14.8 KB
/
createAllLikelihoodCombinations.nim
File metadata and controls
374 lines (333 loc) · 14.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
import std / [sequtils, strformat, strutils, os, sets, algorithm, sugar]
from std / times import epochTime
import shell
from ingrid / ingrid_types import ChipRegion
import cligen / [procpool, mslice, osUt]
#[
Computes all combinations of the following cases
- for year in years of data files given:
- for region in chip region:
- no vetoes
- +scinti veto
- +FADC veto
- +septem veto
- +line veto
All veto combinations are treated additive, with the priority being the one
defined by the local `FlagKind`, but only including the ones you specify!
For that reason it is not possible to in one go generate all *individual* veto
cases at the same time as computing the combined vetoes. Maybe this will be added
in the future.
It will also write the output written by the program to the same output directory
with the same name, but a `.log` extension.
]#
##: XXX: ADD THE `--tracking` FLAG IF DESIRED
type
## A simplified reproduction of the `likelihood` `FlagKind` type only containing the
## aspects we care about
FlagKind = enum
fkLogL # = "",
fkMLP # = "--mlp"
fkScinti # = "--scintiveto"
fkFadc # = "--fadcveto"
fkSeptem # = "--septemveto"
fkLineVeto # = "--lineveto"
fkExclusiveLineVeto # line veto *without* septem veto & lvRegular, ecc cut 1.0
# important this is last!
Veto = object
kind: FlagKind
additional: bool # whether this veto is treated as an additional or also standalone
Settings = object
year: int
region: ChipRegion
signalEff: float # the lnL signal efficiency
eccentricityCutoff: float # line veto eccentricity cutoff
vetoes: set[FlagKind]
vetoPercentile: float # FADC veto percentile to use
Combination = object
fname: string
calib: string
mlpPath: string # path to MLP model file (this really is part of a `Setting`, but for convenience here)
settings: Settings
proc toStr(fk: FlagKind): string =
case fk
of fkLogL: "--lnL"
of fkMLP: "" # must be filled with an argument!
of fkScinti: "--scintiveto"
of fkFadc: "--fadcveto"
of fkSeptem: "--septemveto"
of fkLineVeto: "--lineveto"
of fkExclusiveLineVeto: "--lineveto"
proc `$`(v: Veto): string =
if v.additional:
result = "+"
result.add v.kind.toStr()
proc contains(s: HashSet[Veto], f: FlagKind): bool =
for x in s:
if x.kind == f: return true
iterator items(s: HashSet[Veto]): Veto =
# yield elements in order of `FlagKind`
var vSeq: seq[Veto]
var ms = s
while ms.len > 0:
vSeq.add ms.pop
vSeq = vSeq.sortedByIt(it.kind)
for x in vSeq:
yield x
proc genVetoStr(vetoes: set[FlagKind], comb: Combination): string =
for v in vetoes:
result = result & " " & v.toStr()
if fkMLP in vetoes:
result = result & " --mlp " & comb.mlpPath
template yieldFadc(comb, cfg: untyped): untyped {.dirty.} =
if fkFadc in flags and fadcVetoPercentiles.len > 0: # if FADC contained, yield all percentiles after another
for perc in fadcVetoPercentiles:
cfg.vetoPercentile = perc
comb.settings = cfg
yield comb
else:
comb.settings = cfg
yield comb
template yieldLineVeto(comb, cfg: untyped): untyped {.dirty.} =
if (fkLineVeto in flags or fkExclusiveLineVeto in flags) and eccCutoffs.len > 0:
for ecc in eccCutoffs:
cfg.eccentricityCutoff = ecc
comb.settings = cfg
yieldFadc(comb, cfg)
else:
yieldFadc(comb, cfg)
template yieldVetoes(): untyped {.dirty.} =
for vSet in vetoSets:
var flags: set[FlagKind]
for veto in vSet: # iterate over `HashSet[Veto]`, adding this veto to the current flags
flags.incl veto.kind # combination is therefore returned
if veto.kind == fkExclusiveLineVeto:
# remove septem veto
flags.excl fkSeptem
flags.excl fkLineVeto # don't need line veto anymore
if veto.additional: # in this case we *do not* yield, e.g. `+fkScinti` won't run with scinti only, accumulate
continue
if fkLogL in flags and fkMLP in flags:
raise newException(ValueError, "lnL and NN cuts cannot be combined!")
var cfg = Settings(year: year,
region: region,
signalEff: eff,
vetoes: flags,
vetoPercentile: -1.0,
eccentricityCutoff: 1.0)
var comb = Combination(fname: fname, calib: calib,
settings: cfg)
if fkMLP in flags and mlpPaths.len == 0:
raise newException(ValueError, "If `fkMLP` desired please provide at least one `mlpPath`.")
elif fkMLP in flags:
for mlpPath in mlpPaths:
comb.mlpPath = mlpPath
yieldLineVeto(comb, cfg)
else:
yieldLineVeto(comb, cfg)
iterator genCombinations(f2017, f2018: string,
c2017, c2018: string,
regions: set[ChipRegion],
mlpPaths: seq[string],
signalEff: seq[float],
vetoSets: seq[HashSet[Veto]],
fadcVetoPercentiles: seq[float],
eccCutoffs: seq[float]
): Combination =
for tup in zip(@[f2017, f2018].filterIt(it.len > 0), @[c2017, c2018]):
let (fname, calib) = (tup[0], tup[1])
let year = if fname == f2017: 2017 else: 2018
for region in regions:
if signalEff.len > 0:
for eff in signalEff:
yieldVetoes()
else:
let eff = 0.0 # local zero so that it acts as none given
yieldVetoes()
proc buildFilename(comb: Combination, outpath: string): string =
let cfg = comb.settings
let runPeriod = if cfg.year == 2017: "R2" else: "R3"
result = &"{outpath}/lhood_c18_{runPeriod}_{cfg.region}"
if cfg.signalEff > 0.0:
result.add &"_sEff_{cfg.signalEff}"
for v in cfg.vetoes:
let vetoStr = (v.toStr).replace("--", "").replace("veto", "")
if vetoStr.len > 0: # avoid double `_`
result = result & "_" & vetoStr
if fkMLP in cfg.vetoes:
result = result & "_mlp_" & comb.mlpPath.extractFilename.dup(removeSuffix(".pt"))
if cfg.vetoPercentile >= 0.0:
result = result & "_vQ_" & $cfg.vetoPercentile
if cfg.eccentricityCutoff > 1.0:
result = result & "_ecc_" & $cfg.eccentricityCutoff
result = result & ".h5"
proc runCommand(comb: Combination, cdlFile, outpath: string,
cdlYear: int, dryRun, readOnly, tracking: bool): int =
let cfg = comb.settings
let vetoStr = genVetoStr(cfg.vetoes, comb)
let outfile = buildFilename(comb, outpath)
let regionStr = &"--region={cfg.region}"
let cdlYear = &"--cdlYear={cdlYear}"
let cdlFile = &"--cdlFile={cdlFile}"
let calibFile = if fkFadc in cfg.vetoes or fkMLP in cfg.vetoes: &"--calibFile={comb.calib}"
else: ""
let vetoPerc = if cfg.vetoPercentile > 0.0: &"--vetoPercentile={cfg.vetoPercentile}" else: ""
let readOnly = if readOnly: "--readOnly" else: ""
let sigPrefix = if comb.mlpPath.len > 0: "nnSignalEff" else: "signalEfficiency"
let signalEff = if cfg.signalEff > 0.0: &"--{sigPrefix}={cfg.signalEff}" else: ""
let eccCutoff = if cfg.eccentricityCutoff > 1.0: &"--eccLineVetoCut={cfg.eccentricityCutoff}" else: ""
let trackingStr = if tracking: "--tracking" else: ""
let fname = comb.fname
if not dryRun:
let (res, err) = shellVerbose:
"likelihood -f" ($fname) "--h5out" ($outfile) ($regionStr) ($cdlYear) ($vetoStr) ($cdlFile) ($readOnly) ($calibFile) ($vetoPerc) ($signalEff) ($eccCutoff) ($trackingStr)
# first write log file
let logOutput = outfile.extractFilename.replace(".h5", ".log")
writeFile(&"{outpath}/{logOutput}", res)
# then check error code. That way we have the log at least!
if err != 0:
writeFile(&"{outpath}/{logOutput}.error", "This command failed! Check its log.")
result = err
else:
shellEcho:
"likelihood -f" ($fname) "--h5out" ($outfile) ($regionStr) ($cdlYear) ($vetoStr) ($cdlFile) ($readOnly) ($calibFile) ($vetoPerc) ($signalEff) ($eccCutoff) ($trackingStr)
type
InputData = object
fname: array[512, char] # fixed array for the data filename
calib: array[512, char] # fixed array for filename of calibration file
mlpPath: array[512, char] # fixed array for the path to the (optional) MLP
settings: Settings
proc toArray(s: string): array[512, char] = # could mem copy, but well
doAssert s.len < 512
for i in 0 ..< s.len:
result[i] = s[i]
proc fromArray(ar: array[512, char]): string =
result = newStringOfCap(512)
for i in 0 ..< 512:
if ar[i] == '\0': break
result.add ar[i]
proc `$`(id: InputData): string =
$(fname: id.fname.fromArray(), calib: id.calib.fromArray(), mlpPath: id.mlpPath.fromArray(),
settings: id.settings)
proc toInputData(comb: Combination): InputData =
result = InputData(fname: comb.fname.toArray(), calib: comb.calib.toArray(),
mlpPath: comb.mlpPath.toArray(),
settings: comb.settings)
proc toCombination(data: InputData): Combination =
result = Combination(fname: data.fname.fromArray(), calib: data.calib.fromArray(),
mlpPath: data.mlpPath.fromArray(),
settings: data.settings)
proc errorReport(errors: seq[string]): string =
result = "=== Error summary ===\n"
result.add "\tThe following errors were encountered during the process of all jobs:"
for i, err in errors:
result.add &"Error #{i}: {err}\n"
proc main(f2017, f2018: string = "", # paths to the Run-2 and Run-3 data files
c2017, c2018: string = "", # paths to the Run-2 and Run-3 calibration files (needed for FADC veto)
regions: set[ChipRegion], # which chip regions to compute data for
vetoSets: seq[HashSet[Veto]] = @[],
cdlFile: string,
outpath = "out",
cdlYear = 2018,
dryRun = false,
multiprocessing = false,
mlpPaths: seq[string] = @[], # which MLP models to run with
fadcVetoPercentiles: seq[float] = @[],
signalEfficiency: seq[float] = @[],
eccentricityCutoff: seq[float] = @[],
jobs = 8,
tracking = false,
) =
if vetoSets.anyIt(fkFadc in it) and ( # stop if FADC veto used but calibration file missing
(f2017.len > 0 and c2017.len == 0) or
(f2018.len > 0 and c2018.len == 0)):
doAssert false, "When using the FADC veto the corresponding calibration file to the background " &
"data file is required."
# potentially create the output directory
discard existsOrCreateDir(outpath)
if not multiprocessing: # run all commands in serial
var errors = newSeq[string]()
for comb in genCombinations(f2017, f2018, c2017, c2018, regions, mlpPaths, signalEfficiency, vetoSets, fadcVetoPercentiles, eccentricityCutoff):
var err = -1
while err != 0:
err = runCommand(comb, cdlFile, outpath, cdlYear, dryRun, readOnly = false, tracking = tracking)
if err != 0:
let errMsg = "[ERROR] The command `{cmd}` failed. Retrying now."
errors.add errMsg
# remove the generated output file (who knows what went wrong!
removeFile(buildFilename(comb, outpath))
if errors.len > 0:
echo errorReport(errors)
else:
var cmds = newSeq[InputData]()
for comb in genCombinations(f2017, f2018, c2017, c2018, regions, mlpPaths, signalEfficiency, vetoSets, fadcVetoPercentiles, eccentricityCutoff):
cmds.add comb.toInputData()
for cmd in cmds:
echo "Command: ", cmd, ", tracking? ", tracking
echo "As filename: ", buildFilename(cmd.toCombination(), outpath)
if not dryRun:
# run them using a procpool
let t0 = epochTime()
let jobs = jobs # running with 28 jobs _definitely_ runs out of RAM on a machine with 64GB. 10 seems to work fine.
# However, most of the jobs are done very quickly anyway. The crAll (esp incl septem/line veto)
# are by far the slowest. So while 10 is slower than 28, the difference is small.
## See note at the bottom of the file.
# We use a cligen procpool to handle running all jobs in parallel
var pp = initProcPool((
proc(r, w: cint) =
let i = open(r)
var o = open(w, fmWrite)
var cmd: InputData
var errors = newSeq[string]()
while i.uRd(cmd):
echo "Running value: ", cmd
var err = -1
while err != 0:
let comb = cmd.toCombination()
err = runCommand(comb, cdlFile, outpath, cdlYear, dryRun, readOnly = true, tracking = tracking)
if err != 0:
let errMsg = "[ERROR] The command `{cmd}` failed. Retrying now."
errors.add errMsg
# remove the generated output file (who knows what went wrong!
removeFile(buildFilename(comb, outpath))
discard w.wrLine "INFO: Finished input pair: " & $cmd
if errors.len > 0:
let errRep = errorReport(errors)
writeFile(&"/home/basti/Sync/error_report_job_r_{r}_w_{w}.txt", errRep)
), framesLines, jobs)
proc prn(m: MSlice) = echo m
pp.evalOb cmds, prn
echo "Running all likelihood combinations took ", epochTime() - t0, " s"
when isMainModule:
import cligen/argcvt
from strutils import parseEnum
proc argParse(dst: var HashSet[Veto], dfl: HashSet[Veto],
a: var ArgcvtParams): bool =
var vals = a.val.strip(chars = {'{', '}'}).split(',')
for val in vals:
let valC = val.strip
let additional = if valC[0] == '+': true else: false
try:
let kind = parseEnum[FlagKind](valC.strip(chars = {'+'}))
dst.incl(Veto(kind: kind, additional: additional))
except ValueError:
raise newException(Exception, "Invalid flag given: " & $valC)
result = true
import cligen
dispatch main
#[
A test run of:
```
likelihood -f ~/CastData/data/DataRuns2017_Reco.h5 --h5out /tmp/blabla.h5 --region crAll --cdlYear 2018 --cdlFile ~/CastData/data/CDL_2019/calibration-cdl-2018.h5 --lineveto --scintiveto --fadcveto --calibFile ~/CastData/data/CalibrationRuns2017_Reco.h5
```
peaked at 10.1GB of used memory according to `htop`.
Interestingly it seemed to increase only at the _end_ of each run, probably during the
writing portion of the code.
It's a bit annoying that the memory seems to increase significantly each time. As expected run
186 (the 11 day run) caused the largest spike.
Let's test by compiling `likelihood` with `-d:useMalloc` to see if we actually give memory
back to the system.
Update: This seems to have done the trick! When we reached run 186 in the initial run without
using malloc we were already at 7GB and after at over 9. This time we are at 3.3GB with a very
short peak to 4.1GB during the (likely?) writing period, but a direct drop down to about 3GB again!
So instead of increasing we are actually giving memory back.
]#