Skip to content

Commit f360c62

Browse files
callachennault authored and sheridancbio committed
Preconsume archer-solid-cv4 and add fetch loop (#1129)
* Handle archer-solid-cv4 samples
* Add loop
* move each cohort to its own dir and fix filename
1 parent d554663 commit f360c62

2 files changed

Lines changed: 58 additions & 30 deletions

File tree

import-scripts/detect_samples_with_problematic_metadata.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,7 @@ def sample_missing_required_metadata(metadata, problematic_metadata):
3939
if metadata[required_field] is None:
4040
problematic_metadata["gene-panel"] = None
4141
return True
42-
if required_field == "gene-panel" and metadata[required_field].casefold() == "unknown":
42+
if required_field == "gene-panel" and metadata[required_field].casefold() in {"unknown", "archer-solid-cv4"}:
4343
problematic_metadata["gene-panel"] = metadata[required_field]
4444
return True
4545
return False

import-scripts/preconsume_problematic_samples.sh

Lines changed: 57 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,8 @@
11
#!/usr/bin/env bash
22

33
COHORT=$1
4-
TMP_DIR="/data/portal-cron/tmp/preconsume_problematic_samples"
4+
FETCH_NUM=1
5+
TMP_DIR="/data/portal-cron/tmp/preconsume_problematic_samples/${COHORT}"
56
CVR_FETCH_PROPERTIES_FILEPATH="/data/portal-cron/git-repos/pipelines-configuration/properties/fetch-cvr/application.properties"
67
CVR_USERNAME=$(grep 'dmp.user_name' ${CVR_FETCH_PROPERTIES_FILEPATH} | head -n 1 | sed -E s/[^=][^=]*=//)
78
CVR_PASSWORD=$(grep 'dmp.password' ${CVR_FETCH_PROPERTIES_FILEPATH} | head -n 1 | sed -E s/[^=][^=]*=//)
@@ -13,11 +14,11 @@ CVR_HEME_FETCH_URL_PREFIX="${CVR_TUMOR_SERVER}cbio_retrieve_heme_variants"
1314
CVR_ARCHER_FETCH_URL_PREFIX="${CVR_TUMOR_SERVER}cbio_archer_retrieve_variants"
1415
CVR_ACCESS_FETCH_URL_PREFIX="${CVR_TUMOR_SERVER}cbio_retrieve_access_variants"
1516
CVR_CONSUME_SAMPLE_URL_PREFIX="${CVR_TUMOR_SERVER}cbio_consume_sample"
16-
FETCH_OUTPUT_FILEPATH="$TMP_DIR/cvr_data_${COHORT}.json"
17-
CONSUME_IDS_FILEPATH="$TMP_DIR/${COHORT}_consume.ids"
18-
PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH="$TMP_DIR/problematic_event_consume_${COHORT}.ids"
19-
PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH="$TMP_DIR/problematic_metadata_consume_${COHORT}.ids"
20-
CONSUME_ATTEMPT_OUTPUT_FILEPATH="$TMP_DIR/consume_attempt_output_${COHORT}.json"
17+
FETCH_OUTPUT_FILEPATH=""
18+
CONSUME_IDS_FILEPATH="$TMP_DIR/consume.ids"
19+
PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH="$TMP_DIR/problematic_event_consume.ids"
20+
PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH="$TMP_DIR/problematic_metadata_consume.ids"
21+
CONSUME_ATTEMPT_OUTPUT_FILEPATH="$TMP_DIR/consume_attempt_output.json"
2122
DETECT_SAMPLES_WITH_NULL_DP_AD_FIELDS_SCRIPT_FILEPATH=/data/portal-cron/scripts/detect_samples_with_null_dp_ad_fields.py
2223
DETECT_SAMPLES_WITH_PROBLEMATIC_METADATA_SCRIPT_FILEPATH=/data/portal-cron/scripts/detect_samples_with_problematic_metadata.py
2324
CVR_MONITOR_SLACK_URI_FILE="/data/portal-cron/pipelines-credentials/cvr-monitor-webhook-uri"
@@ -29,6 +30,9 @@ function make_tmp_dir_if_necessary() {
2930
echo "Error : could not create tmp directory '$TMP_DIR'" >&2
3031
exit 1
3132
fi
33+
else
34+
# Remove files from last fetch
35+
rm $TMP_DIR/*
3236
fi
3337
}
3438

@@ -57,6 +61,7 @@ function set_cvr_fetch_url_prefix() {
5761
}
5862

5963
function fetch_currently_queued_samples() {
64+
FETCH_OUTPUT_FILEPATH="$TMP_DIR/cvr_data_${FETCH_NUM}.json"
6065
dmp_token=$(curl $CVR_CREATE_SESSION_URL | grep session_id | sed -E 's/",[[:space:]]*$//' | sed -E 's/.*"//')
6166
curl "${CVR_FETCH_URL_PREFIX}/${dmp_token}/0" > ${FETCH_OUTPUT_FILEPATH}
6267
}
@@ -69,10 +74,13 @@ function detect_samples_with_problematic_metadata() {
6974
$DETECT_SAMPLES_WITH_PROBLEMATIC_METADATA_SCRIPT_FILEPATH ${FETCH_OUTPUT_FILEPATH} ${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}
7075
}
7176

72-
function exit_if_no_problems_detected() {
77+
function problems_were_detected() {
7378
if [ ! -s ${PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH} ] && [ ! -s ${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH} ] ; then
74-
echo "no problematic samples detected .. exiting"
75-
exit 0
79+
echo "no problematic samples detected"
80+
return 1
81+
else
82+
echo "problematic samples were detected"
83+
return 0
7684
fi
7785
}
7886

@@ -108,41 +116,57 @@ function attempt_to_consume_problematic_sample() {
108116
dmp_token="$1"
109117
sample_id="$2"
110118
type_of_problem="$3" # pass 'e' for event problems and 'm' for metadata problems
119+
register_attempt="$4"
111120
HTTP_STATUS=$(curl -sSL -w '%{http_code}' -o "$CONSUME_ATTEMPT_OUTPUT_FILEPATH" "${CVR_CONSUME_SAMPLE_URL_PREFIX}/${dmp_token}/${sample_id}")
112121
if [[ $HTTP_STATUS =~ ^2 ]] ; then
113122
if ! grep '"error": "' "$CONSUME_ATTEMPT_OUTPUT_FILEPATH" ; then
114123
if grep --silent 'affectedRows": 1' "$CONSUME_ATTEMPT_OUTPUT_FILEPATH" ; then
115-
register_successful_consumption "${sample_id}" "$type_of_problem"
116-
continue
124+
if [ "$register_attempt" == true ] ; then
125+
register_successful_consumption "${sample_id}" "$type_of_problem"
126+
continue
127+
fi
117128
fi
118129
fi
119130
fi
120-
register_failed_consumption "${sample_id}" "$type_of_problem"
131+
if [ "$register_attempt" == true ] ; then
132+
register_failed_consumption "${sample_id}" "$type_of_problem"
133+
fi
121134
}
122135

123136
function attempt_to_consume_problematic_samples() {
137+
register_attempt=${1:-true}
124138
dmp_token=$(curl $CVR_CREATE_SESSION_URL | grep session_id | sed -E 's/",[[:space:]]*$//' | sed -E 's/.*"//')
125139
while read sample_id ; do
126-
attempt_to_consume_problematic_sample "$dmp_token" "$sample_id" "e"
140+
attempt_to_consume_problematic_sample "$dmp_token" "$sample_id" "e" "$register_attempt"
127141
done < ${PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH}
128142
while read sample_id ; do
129-
attempt_to_consume_problematic_sample "$dmp_token" "$sample_id" "m"
143+
attempt_to_consume_problematic_sample "$dmp_token" "$sample_id" "m" "$register_attempt"
130144
done < ${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}
131145
}
132146

133147
function consume_hardcoded_samples() {
134148
rm -f ${PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH} ${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}
135149
touch ${PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH}
136150
touch ${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}
137-
138151
if [ "$COHORT" == "mskimpact" ] ; then
139152
echo "P-0025907-N01-IM6" >> "${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}"
140153
fi
141154
if [ -f "${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}" ] ; then
142-
attempt_to_consume_problematic_samples
155+
# Won't register attempt (so it doesn't show up in logs every night)
156+
attempt_to_consume_problematic_samples false
143157
fi
144158
}
145159

160+
function need_to_log_actions {
161+
if [ ${#succeeded_to_consume_problematic_events_sample_list[@]} -gt 0 ] || \
162+
[ ${#failed_to_consume_problematic_events_sample_list[@]} -gt 0 ] || \
163+
[ ${#succeeded_to_consume_problematic_metadata_sample_list[@]} -gt 0 ] || \
164+
[ ${#failed_to_consume_problematic_metadata_sample_list[@]} -gt 0 ] ; then
165+
return 0
166+
fi
167+
return 1
168+
}
169+
146170
function log_actions() {
147171
date
148172
echo -e "${COHORT^^} Problematic Samples"
@@ -158,7 +182,7 @@ function post_slack_message() {
158182
if [ ${#failed_to_consume_problematic_events_sample_list[@]} -gt 0 ]; then
159183
MESSAGE="${MESSAGE} Attempted Unsuccessfully To Consume :\n${failed_to_consume_problematic_events_sample_list[*]}"
160184
fi
161-
MESSAGE="${MESSAGE}Warning : the following samples have been preemptively consumed before fetch because they contained problematic metadata where the gene-panel property was unset or had value UNKNOWN.\nSuccessfully Consumed :\n${succeeded_to_consume_problematic_metadata_sample_list[*]}"
185+
MESSAGE="${MESSAGE}Warning : the following samples have been preemptively consumed before fetch because they contained problematic metadata where the gene-panel property was unset, invalid, or had value UNKNOWN.\nSuccessfully Consumed :\n${succeeded_to_consume_problematic_metadata_sample_list[*]}"
162186
if [ ${#failed_to_consume_problematic_metadata_sample_list[@]} -gt 0 ]; then
163187
MESSAGE="${MESSAGE} Attempted Unsuccessfully To Consume :\n${failed_to_consume_problematic_metadata_sample_list[*]}"
164188
fi
@@ -168,20 +192,24 @@ function post_slack_message() {
168192
date
169193
check_args
170194
make_tmp_dir_if_necessary
171-
failed_to_consume_problematic_events_sample_list=() # temporary code
172-
succeeded_to_consume_problematic_events_sample_list=() # temporary code
173-
failed_to_consume_problematic_metadata_sample_list=() # temporary code
174-
succeeded_to_consume_problematic_metadata_sample_list=() # temporary code
175195
set_cvr_fetch_url_prefix
176-
consume_hardcoded_samples # temporary code
177-
fetch_currently_queued_samples
178-
detect_samples_with_problematic_events
179-
detect_samples_with_problematic_metadata
180-
exit_if_no_problems_detected
181196
failed_to_consume_problematic_events_sample_list=()
182197
succeeded_to_consume_problematic_events_sample_list=()
183198
failed_to_consume_problematic_metadata_sample_list=()
184199
succeeded_to_consume_problematic_metadata_sample_list=()
185-
attempt_to_consume_problematic_samples
186-
log_actions
187-
post_slack_message
200+
while :
201+
do
202+
consume_hardcoded_samples # temporary code
203+
fetch_currently_queued_samples
204+
detect_samples_with_problematic_events
205+
detect_samples_with_problematic_metadata
206+
if ! problems_were_detected ; then
207+
break
208+
fi
209+
attempt_to_consume_problematic_samples
210+
((FETCH_NUM++))
211+
done
212+
if need_to_log_actions ; then
213+
log_actions
214+
post_slack_message
215+
fi

0 commit comments

Comments
 (0)