Skip to content

Commit fad28de

Browse files
committed
wip(pnote): as functions, default to write sidecar
1 parent f33954e commit fad28de

1 file changed

Lines changed: 167 additions & 109 deletions

File tree

pnote

Lines changed: 167 additions & 109 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,18 @@ Like datalad, but not linked to source control versioning
1313
OPTIONS:
1414
-o OUTPUT_FILE output, can repeat
1515
-i INPUT_FILE input file, can repeat
16+
-l FILELIST file to pull list of inputs from
17+
can be named pipe like: <(ls *.nii.gz)
18+
NB. newline in file names will be a problem
1619
-s|--stdout FILE capture stdout to FILE
1720
-e|--ifneeded only run if output(s) older than input(s)
1821
mnemonic for 'e': *e*xisting skipped
1922
or if hashes changed
2023
-d|--db DBPATH specify sqlite3 db path. Alt set env PNOTEDB
2124
if git, goes into .pnote.sqlite3
2225
otherwise \$PWD/.pnote.sqlite3
23-
-c|--sidecar write to first output file sidecar .output.pnote
26+
-C|--no-sidecar SKIP writing output file sidecar like .output.pnote
27+
-D|--no-db SKIP writing to DB
2428
-n|--dryrun same as setting DRYRUN=1. show, don't do
2529
TODO:
2630
* faketime for reproducibility?
@@ -47,35 +51,29 @@ create table prov (
4751
create table inf (hash text, stime int, file text, modtime int);
4852
create table outf (hash text, stime int, file text);
4953
create table meta (inittime datetime, version text, inithost text);
50-
insert into meta values (datetime('now'), '$PNOTEVER', '$HOSTNAME');"
54+
insert into meta values (unixepoch('now'), '$PNOTEVER', '$HOSTNAME');"
5155

52-
record_to(){
53-
declare -g DBFILE
54-
local backend="${1:?need provenance backend}"; local cmd_output=${2:-$PWD}
55-
case $backend in
56-
db)
57-
# PNOTEDB overwrite, otherwise git root, otherwise PWD
58-
db="${PNOTEDB:-}"
59-
[ -z "$db" ] &&
60-
db=$(git rev-parse --show-toplevel || echo "$PWD")/.pnote.sqlite3
56+
# Resolve the sqlite3 provenance database and store its path in DBFILE.
# Globals:
#   DO_DB   (read)    any value other than 1 disables the database
#   PNOTEDB (read)    explicit database path; wins over auto-detection
#   SCHEMA  (read)    SQL fed to sqlite3 when the database is not readable yet
#   DBFILE  (written) resolved path, or "" when the database is disabled
set_db(){
  declare -g DBFILE DO_DB
  # keep the scratch path out of the caller's namespace
  local db

  # disabled DB has empty DBFILE
  if [ "${DO_DB:-1}" -ne 1 ]; then
    DBFILE=""
    return 0
  fi

  # PNOTEDB overwrite, otherwise git root, otherwise PWD
  db="${PNOTEDB:-}"
  if [ -z "$db" ]; then
    # outside a repository git complains on stderr; falling back to PWD is the
    # expected path there, so that message is only noise
    db=$(git rev-parse --show-toplevel 2>/dev/null || echo "$PWD")/.pnote.sqlite3
  fi

  if ! test -r "$db"; then
    echo "# NOTE: make '$db'" >&2
    dryrun sqlite3 "$db" <<< "$SCHEMA"
  fi
  DBFILE=$db
}
7774
record_start(){
7875
declare -g DBFILE
76+
[ -z "$DBFILE" ] && return 0
7977
local cmd_hash="$1"; shift
8078
local now="$1"; shift
8179
local cmd=$(printf "%q" "$1"); shift
@@ -90,6 +88,7 @@ HERE
9088
}
9189
record_in(){
9290
declare -g DBFILE
91+
[ -z "$DBFILE" ] && return 0
9392
local h=${1:?cmd hash from start}; shift
9493
local n=${1:?start time of command}; shift
9594
# all other inputs are like 'moddate file'
@@ -108,6 +107,7 @@ record_in(){
108107
}
109108
record_out(){
110109
declare -g DBFILE
110+
[ -z "$DBFILE" ] && return 0
111111
local h=${1:?cmd hash from start}; shift
112112
local n=${1:?start time of command}; shift
113113
while [ $# -gt 0 ]; do
@@ -117,111 +117,169 @@ record_out(){
117117
}
118118

119119
# Close out a provenance row: stamp the end time and the exit status.
# Globals:   DBFILE (read) - empty means the DB is disabled and nothing is written
# Arguments: $1 - command hash
#            $2 - start time of the command (identifies the row with $1)
#            $3 - exit status of the command
record_done(){
  declare -g DBFILE
  if [ -z "$DBFILE" ]; then
    return 0
  fi
  local h=${1:?}
  local n=${2:?}
  local s=${3:?}
  sqlite3 "$DBFILE" "
update prov set
etime=unixepoch('now'), status = '$s'
where hash='$h' and stime='$n';"
}
126128

129+
# TODO: show sidecar version?
127130
# Print the most recent provenance row whose output file matches a pattern.
# Globals:   DBFILE (set via set_db, then read)
# Arguments: $1 - sqlite LIKE pattern matched against recorded output files
# Outputs:   tab-separated table with header: status, etime, cmd
show_db(){
  declare -g DBFILE
  local quote="'"
  local pattern=${1:-}
  # double every single quote so a name like "it's.txt" stays inside the SQL literal
  pattern=${pattern//$quote/$quote$quote}

  set_db
  # record_done stores etime with unixepoch('now'); without the 'unixepoch'
  # modifier sqlite reads a bare number as a Julian day. datetime() yields the
  # same 'YYYY-MM-DD HH:MM:SS' text as the '%F %T' format.
  sqlite3 -separator $'\t' -header "$DBFILE" <<HERE
select p.status, datetime(p.etime, 'unixepoch') etime, p.cmd from prov p join outf o on p.hash=o.hash where file like '$pattern' order by p.etime desc limit 1;
HERE
}
133137

134138
_OUT=()
135139
_IN=()
136-
stdout=
137-
ifneeded=0
138-
BACKEND="db"
139-
while [ $# -ne 0 ]; do
140-
case "$1" in
141-
show) show_db "$2"; shift 2; exit ;;
142-
--) shift; break;;
143-
-o|--output)
144-
_OUT+=("${2:?$1 must be followed by an output file}"); shift 2;;
145-
--stdout|-s)
146-
_OUT+=("${2:?$1 must be followed by an output file}");
147-
stdout=$2;
148-
shift 2;;
149-
-i|--input)
150-
_IN+=("${2:?$1 must be following by an input file}"); shift 2;;
151-
-e|--ifneeded) ifneeded=1; shift;;
152-
-c|--sidecar) BACKEND="db"; shift;
153-
echo "sidecar not implemented yet"; exit 1;;
154-
-n|--dryrun) DRYRUN=echo; shift;;
155-
-d|--db) PNOTEDB=${2:?--db requires db path}; shift 2;;
140+
STDOUT=
141+
IFNEEDED=0
142+
DO_SIDECAR=1
143+
DO_DB=1
144+
cmd_and_args=()
145+
# Parse pnote's own options; whatever is left becomes the command to run.
# Globals (written):
#   _OUT _IN      arrays of output / input files (appended to)
#   STDOUT        file that captures the command's stdout (-s)
#   IFNEEDED      1 when -e was given
#   DO_SIDECAR    0 when -C was given
#   DO_DB         0 when -D was given
#   PNOTEDB       database path from -d
#   DRYRUN        'echo' when -n was given
#   cmd_and_args  array holding the remaining words
# Arguments: pnote options, optional '--', then the command and its arguments
# Exits:     after 'show'; with 1 on an unknown option
pnote_parse(){
  declare -g _OUT _IN STDOUT IFNEEDED DO_SIDECAR DO_DB cmd_and_args
  local -a inlist=()
  while [ $# -ne 0 ]; do
    case "$1" in
      --) shift; break;; # rest is command
      show)
        show_db "${2:?show must be followed by a file pattern}"
        shift 2
        exit ;;
      -o|--output)
        _OUT+=("${2:?$1 must be followed by an output file}"); shift 2;;
      --stdout|-s)
        # the captured stdout file is also an output of the command
        _OUT+=("${2:?$1 must be followed by an output file}")
        STDOUT=$2
        shift 2;;
      -i|--input)
        _IN+=("${2:?$1 must be following by an input file}"); shift 2;;
      -l|--input_list)
        # options must come before the array name: a trailing -t is ignored
        # and every file name would keep its newline
        mapfile -t inlist < "${2:?must have input file list}"
        _IN+=("${inlist[@]}")
        shift 2;;

      -d|--db) PNOTEDB=${2:?--db requires db path}; shift 2;;

      -e|--ifneeded) IFNEEDED=1; shift;;
      -n|--dryrun) DRYRUN='echo'; shift;;
      # disable things
      -C|--no-sidecar) DO_SIDECAR=0; shift;;
      -D|--no-db) DO_DB=0; shift;;

      # die on unknown options
      -*)
        echo "ERROR: unknown option $1" >&2; exit 1;;

      # anything else is likely a command
      *) break;;
    esac
  done

  cmd_and_args=("$@")
}
165183

166184
# Print one "modtime name" line per file.
# Arguments: $1  - stand-in modtime for files that do not exist
#                  ('0' for oldest; 'error' to break)
#            $2… - files to inspect
# Outputs:   "<epoch mtime> <name>" per file on stdout
# Returns:   1 at the first missing file when $1 is 'error'
stat_exist(){
  local default=${1:?missing file number. '0' for oldest; 'error' to break}; shift
  local f
  for f in "$@"; do
    # '--' keeps a name that starts with '-' from being read as an option
    test -e "$f" && stat -c "%Y %n" -- "$f" && continue

    if [[ $default == "error" ]]; then
      echo "ERROR: missing file '$f'" >&2
      return 1
    fi
    printf '%s %s\n' "$default" "$f"
  done
}
193+
# Hash the given words into an md5 hex digest.
# Arguments: words to hash; joined with spaces, newline-terminated
# Outputs:   hex digest on stdout
mkhash(){
  local machine='' digest
  # maybe we want to include machine info? disabled for now
  : || machine="$USER@$HOSTNAME:$PWD"
  digest=$(md5sum <<< "$machine$*") || return
  # md5sum prints "<digest>  -"; keep only the digest
  printf '%s\n' "${digest%% *}"
}
175199

176-
#cmd_and_args=("$(printf "%q " "$@")")
177-
cmd_and_args=("$@")
178-
cmd_hash=$(md5sum <<< "$USER@$HOSTNAME:$PWD ${cmd_and_args[*]}" | sed 's/ -$//')
179-
now=$(date +%s)
180-
out_times=("$now") # default for no output: set as always up-to-date
181-
in_times=()
182-
if [ ${#_IN[@]} -gt 0 ]; then
183-
mapfile -t in_times < <(stat_exist error "${_IN[@]}" | sort -n)
184-
[ ${#in_times[@]} -ne ${#_IN[@]} ] &&
185-
echo "ERROR: not all input files exist" >&2 &&
186-
exit 1
187-
fi
188-
[ ${#_OUT[@]} -gt 0 ] &&
189-
mapfile -t out_times < <(stat_exist 0 "${_OUT[@]}" | sort -nr) # oldest first
190-
newest_in="${in_times[0]:-0// */}" # remove name, just stat
191-
oldest_out="${out_times[0]// */}"
192-
# TODO: if no output but still want run if needed?
193-
oldest_out=${oldest_out?-0} # no output, make start of time.
194-
# if newest output is older than newest input AND cmd_hash is same, nothing to do?
195-
if [[ $ifneeded -eq 1 && \
196-
-n "${newest_in}" && -n "${oldest_out}" && \
197-
${newest_in} -lt ${oldest_out} \
198-
]]; then # `# check cmd_hash?`
199-
echo "# not rerunning: input '${in_times[0]}' $(bc -l <<< "$oldest_out - $newest_in") seconds older than output '${out_times[0]}'" >&2
200-
exit 0
201-
fi
200+
# Map an output path to its hidden sidecar: 'path/x.txt' -> 'path/.x.txt.pnote'
# Arguments: $1 - output file path (required)
# Outputs:   sidecar path on stdout
find_sidecar(){
  local cmd_output=${1:?output file}
  local parent leaf sidecar
  parent=$(dirname "$cmd_output")
  leaf=$(basename "$cmd_output")
  # strip any trailing slash so the result can never name a directory
  sidecar=$(printf '%s\n' "$parent/.$leaf.pnote" | sed 's:/\+$::')
  echo "$sidecar"
}
208+
# Append a "command # inputs" line to the sidecar file of one output.
# Globals:   DO_SIDECAR (read) - any value other than 1 makes this a no-op
#            sidecar, s (written) - sidecar path and touch status
# Arguments: $1  - output file the sidecar belongs to
#            $2  - command that was run
#            $3… - input files, recorded space-joined after the '#'
# Returns:   0 when skipped or written; touch's status if the sidecar is not writable
write_sidecar(){
  if [ "${DO_SIDECAR:-1}" -ne 1 ]; then
    return 0
  fi
  local cmd_output=${1:?output file needed}; shift
  local cmd=${1:?command that was run needed}; shift
  local input_files="$*"

  sidecar=$(find_sidecar "$cmd_output")
  # probe permissions first so an unwritable location is a warning, not a crash
  s=0
  touch "$sidecar" 2>/dev/null || s=$?
  if [ "$s" -eq 0 ]; then
    printf "%s # %s\n" "$cmd" "$input_files" >> "$sidecar"
    return
  fi
  echo "WARNING: cannot write to sidecar '$sidecar' (err $s)" >&2
  return $s
}
202224

203-
# nothing to do, just exit
204-
if [ -n "${DRYRUN:-}" ]; then
205-
echo "${cmd_and_args[*]}"
225+
# Entry point: parse options, decide whether the command has to run, run it,
# and record provenance in the sqlite3 DB and in per-output sidecar files.
# Globals:   set by pnote_parse: _IN _OUT STDOUT IFNEEDED cmd_and_args
#            read: DRYRUN
#            written: cmd_hash started in_times out_times newest_in oldest_out
#                     cmd_status
# Arguments: pnote options followed by the command to run
# Exits:     1 when an input file is missing; 0 on dry run or when up to date
pnote_main(){
  pnote_parse "$@" # set
  # NOTE(review): the hash covers every argument, pnote's own options
  # included, not only the wrapped command - confirm that is intended
  cmd_hash=$(mkhash "$@")
  started=$(date +%s)
  out_times=("$started") # default for no output: set as always up-to-date
  in_times=()
  if [ ${#_IN[@]} -gt 0 ]; then
    # newest input first
    mapfile -t in_times < <(stat_exist error "${_IN[@]}" | sort -nr)
    if [ ${#in_times[@]} -ne ${#_IN[@]} ]; then
      echo "ERROR: not all input files exist" >&2
      exit 1
    fi
  fi
  if [ ${#_OUT[@]} -gt 0 ]; then
    # oldest output first; a missing output is reported as 0 and sorts to the
    # front, which forces a rerun
    mapfile -t out_times < <(stat_exist 0 "${_OUT[@]}" | sort -n)
  fi

  # entries are 'modtime name': keep the modtime only
  newest_in="${in_times[0]:-0}"
  newest_in="${newest_in%% *}"
  oldest_out="${out_times[0]%% *}"
  # TODO: if no output but still want run if needed?
  oldest_out=${oldest_out:-0} # no output, make start of time.

  # every input is older than every output: nothing to do
  if [[ $IFNEEDED -eq 1 && $newest_in -lt $oldest_out ]]; then # `# check cmd_hash?`
    echo "# not rerunning: input '${in_times[0]:-}' $((oldest_out - newest_in)) seconds older than output '${out_times[0]}'" >&2
    exit 0
  fi

  # nothing to do, just exit
  if [ -n "${DRYRUN:-}" ]; then
    echo "${cmd_and_args[*]}"
    exit 0
  fi

  set_db # DBFILE=sqlite location (PNOTEDB, git root, or PWD)
  record_start "$cmd_hash" "$started" "${cmd_and_args[*]}"
  record_in "$cmd_hash" "$started" "${in_times[@]}"
  record_out "$cmd_hash" "$started" "${_OUT[@]}"

  set +e # allow this to fail
  if [ -n "$STDOUT" ]; then
    "${cmd_and_args[@]}" > "$STDOUT"
  else
    "${cmd_and_args[@]}"
  fi
  cmd_status=$?
  set -e
  record_done "$cmd_hash" "$started" "$cmd_status"

  # side cars
  # NOTE(review): the command's exit status is recorded but not returned to
  # the caller - confirm whether pnote should propagate it
  for out in "${_OUT[@]}"; do
    test -z "$out" && continue
    if ! test -r "$out"; then
      echo "# WARNING: '$out' expected but not made!" >&2
      continue
    fi
    write_sidecar "$out" "${cmd_and_args[*]}" "${_IN[@]}" || :
  done
}
226284

227-
# TODO: check all output files exist
285+
eval "$(iffmain pnote_main)"

0 commit comments

Comments
 (0)