compbench/compbench.py at main · ram-compbio/compbench · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
##############################################################################
# General benchmarking protocol                                              #
# Written February 2024, commented/updated November 2024                     #
# Important notes:                                                           #
# For provided mappings, CTD drugs/indications can be id'd by name or id     #
#   i.e. CANDO ID for drugs, MeSH code for indications                    #
# TTD IDs are non-unique so TTD drugs/indications are identified by name     #
#   i.e. "X-rays imaging" and "Herbicide" instead of "N.A." for both         #
#   i.e. "D04XVN" and "D00QDJ" are both "trametinib"
##############################################################################

import random, math

##############################################################################
# Drug-indication mapping creation                                           #
##############################################################################

def extract_drug_indic_mapping(filename, key='iname', val='dname', min_d=1):
  '''
  Input
    filename - str, path to file with drug-indication mapping
    key - str, "iid" will use indication ID, otherwise name
    val - str, "did" will use drug ID, otherwise name
    min_d - int, minimum # of drugs for included indics; default=1

  mapping file should be a tsv with 4 labelled columns:
    Drug ID  Drug Name  Indication Name  Indication ID
    no specific format for drug/indication ID, but should be unique
    drug name & indication name should also be unique, match to drug/indic id
    a drug may have multiple indications & vice versa

  Returns a tuple of 2 items:
    indic_to_drugs - dict of str:list, known drug-indication mapping
      each indication name/id (key) is mapped to 1+ drug name/ids (values)
    cmpd_set - set of str, all drug name/ids in the drug-indic mapping
  '''
  with open(filename, 'r') as f:
    lines = f.read().strip().split('\n')

  indic_to_drugs = {}
  cmpd_set = set()
  print('Creating dict translating %s to %s' % \
        ('indication ID' if key == 'iid' else 'indication name',\
         'drug IDs' if val == 'did' else 'drug names'))

  for line in lines[1:]:
    did, dname, iname, iid = line.split('\t')
    if key == 'iid':
      ikey = iid.title()
    else:
      ikey = iname.title()

    if val == 'did':
      dval = did.title()
    else:
      dval = dname.lower().replace(' ', '_')

    cmpd_set.add(dval)

    if ikey in indic_to_drugs:
      if dval not in indic_to_drugs[ikey]:
        indic_to_drugs[ikey].append(dval)
    else:
      indic_to_drugs[ikey] = [dval]

  if min_d > 1:
    for key in list(indic_to_drugs.keys()):
      if len(indic_to_drugs[key]) < min_d:
        del indic_to_drugs[key]

  return indic_to_drugs, cmpd_set

def split_indics(indic_to_drugs, num_splits, by='indics'):
  '''
  indic_to_drugs - dict, indic (str) : drugs (list of str) pairs
                   indication/drug pairs to be split
  num_splits - int, number of dicts to split pairs into
  by - str, indics or pairs
                   indics - even # indications, eg for Xfold validation
                   pairs - even # of indic-drug pairs, eg for leave-one-out

  Returns list of num_splits indic_to_drugs dicts
    if by == indics, splits will be even # indics +- one
    if by == pairs, splits will be as even as possible given indic lens
    note that indics will NEVER be divided into multiple splits
  '''
  splits = [{} for _ in range(num_splits)]
  if len(indic_to_drugs) < num_splits:
    print('Not enough indications to split into %d' % num_splits)
    print('Splitting into %d (# indications) instead' % len(indic_to_drugs))
    num_splits = len(indic_to_drugs)
  else:
    print('Splitting %d indications into %d splits by %s' %
          (len(indic_to_drugs), num_splits, \
           'indics' if by == 'indics' else 'pairs'))

  if by == 'indics':
    num_per = len(indic_to_drugs) // num_splits
    extra = len(indic_to_drugs) % num_splits

    indics = tuple(indic_to_drugs.keys())
    start = 0
    for i in range(num_splits):
      num_incl = num_per + (1 if i < extra else 0)

      for j in range(start, start + num_incl):
        splits[i][indics[j]] = indic_to_drugs[indics[j]][:]

      start += num_incl

  else:
    indics = list(indic_to_drugs.keys())
    indics.sort(key=lambda x: len(indic_to_drugs[x]))
    lens = {}
    for i in range(num_splits):
      key = indics[-1]
      splits[i][key] = indic_to_drugs[key][:]

      key_len = len(indic_to_drugs[key])
      if key_len in lens:
        lens[key_len].append(i)
      else:
        lens[key_len] = [i]
      indics.pop()

    min_len = 0
    while indics:
      if min_len not in lens:
        min_len += 1
      else:
        key = indics[-1]
        i = lens[min_len][-1]

        splits[i][key] = indic_to_drugs[key][:]
        key_len = len(indic_to_drugs[key])
        new_len = min_len + key_len

        if new_len in lens:
          lens[new_len].append(i)
        else:
          lens[new_len] = [i]

        indics.pop()
        lens[min_len].pop()
        if len(lens[min_len]) == 0:
          del lens[min_len]

    print('Final split lengths range from %d to %d pairs (%d fold diff)' % \
          (min(lens.keys()), max(lens.keys()), \
           max(lens.keys())//min(lens.keys())))

  return splits


##############################################################################
# Data-splitting generators                                                  #
##############################################################################

def leave_one_out(drug_indic_mapping):
  '''
  drug_indic_mapping - dict, indic (str) : drugs (list of str) pairs

  Returns generator object
    Yields indic (str), indic_drugs (tuple of str), left_out (tuple of str)
    Usage: for indic, drugs, left_out in leave_one_out(standard):
    Will iterate thru every indic:drug pair in the dict
    To exlude indic:drug pairs, remove them from the dict
  '''
  for indic in drug_indic_mapping:
    for drug in drug_indic_mapping[indic]:
      #print(drug, drug_indic_mapping[indic])
      indic_drugs = drug_indic_mapping[indic][:]
      indic_drugs.remove(drug)
      yield indic, tuple(indic_drugs), (drug,)

def strat_xfold_cross(drug_indic_mapping, folds=10, seed=0):
  '''
  drug_indic_mapping - dict, indic (str) : drugs (list of str) pairs
  folds - int, # of folds the indic:drug pairs will be divided into
                  default 10 (10-fold cross validation)

  Will divide into folds stratified by indic (indic roughly evenly distrib)
    Note for indics of len < folds, some folds will be empty

  Returns generator object
    Yields indic (str), indic_drugs (tuple of str), left_outs (tuple of str)
    Usage: for indic, drugs, left_out in strat_xfold_cross(standard):
    Will iterate thru each fold's left out in order per indic
      ie will go indic 1 fold 1, indic 1 fold 2, indic 1 fold 3, etc
  '''
  random.seed(0)

  for indic in drug_indic_mapping:

    drug_list = drug_indic_mapping[indic][:]
    drugs_per = len(drug_list) // folds
    extra = len(drug_list) % folds
    for _ in range(3):
      random.shuffle(drug_list)

    extra_folds = set(random.sample(range(folds), extra))

    start_i = 0
    for i in range(folds):

      num_incl = drugs_per + (1 if i in extra_folds else 0)
      left_outs = drug_list[start_i : start_i + num_incl]
      indic_drugs = drug_list[:start_i] + drug_list[start_i + num_incl:]

      start_i = start_i + num_incl

      yield indic, tuple(indic_drugs), tuple(left_outs)

def save_strat_xfold_cross(drug_indic_mapping, folds=10, seed=0, name=''):
  '''
  drug_indic_mapping - dict, indic (str) : drugs (list of str) pairs
  folds - int, # of folds the indic:drug pairs will be divided into
                  default 10 (10-fold cross validation)

  Will divide into folds stratified by indic (indic roughly evenly distrib)
    Note for indics of len < folds, some folds will be empty

  Creates file saving split folds w/filename based on fold #, seed, & name
    Format: Indic  Fold  In Indic  Left Out
    Indic    - str, indication name
    Fold     - int, fold of that indication the following are included in
    In Indic - comma-separated list of str, drugs considered "in" indication
    Left Out - comma-separated list of str, "new" drugs being assessed
  '''
  random.seed(seed)
  if not name:
    filename = '%dfold_cross_seed%d.tsv' % (folds, seed)
  else:
    filename = '%dfold_cross_seed%d_%s.tsv' % (folds, seed, name)

  with open(filename, 'w') as f:
    f.write('Indic\tFold\tIn Indic\tLeft Out\n')

  for indic in drug_indic_mapping:
    drug_list = drug_indic_mapping[indic][:]
    drugs_per = len(drug_list) // folds
    extra = len(drug_list) % folds
    for _ in range(3):
      random.shuffle(drug_list)

    extra_folds = set(random.sample(range(folds), extra))

    start_i = 0
    for i in range(folds):

      num_incl = drugs_per + (1 if i in extra_folds else 0)
      left_outs = drug_list[start_i : start_i + num_incl]
      indic_drugs = drug_list[:start_i] + drug_list[start_i + num_incl:]

      start_i = start_i + num_incl

      with open(filename, 'a') as f:
        f.write('%s\t%d\t%s\t%s\n' % (indic, i+1, \
                                      ','.join(indic_drugs),\
                                      ','.join(left_outs)))

def read_strat_xfold_cross(drug_indic_mapping, filename):
  '''
  drug_indic_mapping - dict, indic (str) : drugs (list of str) pairs
  filename - str, file with saved stratified folds

  Returns generator object
    Yields indic (str), indic_drugs (tuple of str), left_outs (tuple of str)
    Usage: for indic, drugs, left_out in strat_xfold_cross(standard):
    Will iterate thru left out cmpds in same order as input file
    Note: will skip indications not in drug_indic_mapping
  '''
  random.seed(0)

  with open(filename, 'r') as f:
    lines = f.read().strip().split('\n')

  for line in lines:
    sections = line.split('\t')
    if len(sections) == 3:
      indic, fold, indic_drugs = sections
      left_outs = ''
    else:
      indic, fold, indic_drugs, left_outs = sections
    indic = indic.title()

    if indic not in drug_indic_mapping:
      continue

    indic_drugs = tuple(indic_drugs.split(','))
    left_outs = tuple(left_outs.split(','))
    if len(left_outs) == 1 and left_outs[0] == '':
      left_outs = tuple()

    yield indic, tuple(indic_drugs), tuple(left_outs)


##############################################################################
# Scoring functions (rank, AUROC, NDCG)                                      #
##############################################################################

def get_rank(ranked_list, true_cmpds):
  '''
  ranked_list - list of str, cmpd names/ids in a specific predicted order
  true_cmpds - str or list of str, cmpd name/id(s) to be matched to ranks
    all true_cmpds should appear at least once in ranked_list
    (in this case, the list of withheld compounds)

  Returns list of int representing ranks of true_cmpds in ranked_list
  '''
  ranks = []
  if type(true_cmpds) == str:
    true_cmpds = [true_cmpds]

  for cmpd in true_cmpds:
    ranks.append(ranked_list.index(cmpd) + 1)

  return ranks

def get_auroc(filename, max_fpr=2.0):
  '''
  filename - str, name of the file from which AUROC will be calculated
    should be tsv with 5 columns: Indic  Split  Cmpd  Rank  OutOf
    (file generated using benchmarking function)
  max_fpr (optional) - float, max FPR for AUROC to be calculated up through
    any number >= 1 or no number will calculate full AUROC

  Prints and returns area under receiver operator curve metric for given data
  '''
  with open(filename, 'r') as f:
    lines = f.read().strip().split('\n')

  if not lines[0].split('\t')[3].isdigit():
    lines = lines[1:]

  # Extract data (number of splits, # cmpds ranked per indic, ranks)
  splits = set()
  out_of_counts = {}
  ranks = []
  for line in lines:
    cells = line.split('\t')
    ranks.append(int(cells[3]))
    if (cells[0], cells[1]) in splits:
      continue
    else:
      splits.add((cells[0], cells[1]))
    key = int(cells[4])
    if key in out_of_counts:
      out_of_counts[key] += 1
    else:
      out_of_counts[key] = 1

  max_rank = max(out_of_counts.keys())
  num_splits = sum(out_of_counts.values())
  ranks.sort()

  # Count the number of approved & unapproved drugs at each rank
  pos = [0]*max_rank
  neg = [0]*max_rank
  splits_not_at_rank = 0
  j = 0
  for rank in range(1, max_rank + 1):
    i = rank - 1
    total = num_splits - splits_not_at_rank

    while j < len(ranks) and ranks[j] == rank:
      j += 1
      pos[i] += 1

    neg[i] = total - pos[i] + neg[i-1]
    pos[i] += pos[i-1]

    if rank in out_of_counts:
      splits_not_at_rank += out_of_counts[rank]

  neg = [x/neg[-1] for x in neg]
  pos = [x/pos[-1] for x in pos]

  # Calculate AUROC from TPR & FPR metrics at each rank
  total_area = 0
  for i in range(1,len(neg)):
    if neg[i] > max_fpr:
      run = max_fpr - neg[i-1]
      rise = ((pos[i] - pos[i-1])/(neg[i] - neg[i-1]))*run # slope times run
      total_area += (pos[i]*run) + ((rise*run)/2)
      break
    total_area += ((pos[i] + pos[i-1])/2)*(neg[i] - neg[i-1])

  print('AUROC', total_area, ('at max FPR ' + \
        str(max_fpr)) if max_fpr < 1 else '')

  return total_area

def write_auroc_graphable(filename, new_filename):
  '''
  filename - str, name of the file from which AUROC will be calculated
    should be tsv with 5 columns: Indic  Split  Cmpd  Rank  OutOf
    (file generated using benchmarking function)
  new_filename - str, name of output file to be created

  Creates TSV file named after new_filename with 3 columns:
      rank - int, rank threshold at which FPR/TPR was calculated
      FPR - float with 3 decimal places, false positive rate through rank
      TPR - float with 3 decimal places, true positive rate through rank
    Data can be used to create a ROC graph (FPR vs. TPR)
  '''
  with open(filename, 'r') as f:
    lines = f.read().strip().split('\n')

  if not lines[0].split('\t')[3].isdigit():
    lines = lines[1:]

  # Extract data (number of splits, # cmpds ranked per indic, ranks)
  splits = set()
  out_of_counts = {}
  ranks = []
  for line in lines:
    cells = line.split('\t')
    ranks.append(int(cells[3]))
    if (cells[0], cells[1]) in splits:
      continue
    else:
      splits.add((cells[0], cells[1]))
    key = int(cells[4])
    if key in out_of_counts:
      out_of_counts[key] += 1
    else:
      out_of_counts[key] = 1

  max_rank = max(out_of_counts.keys())
  num_splits = sum(out_of_counts.values())
  ranks.sort()

  # Count the number of approved & unapproved drugs at each rank
  pos = [0]*max_rank
  neg = [0]*max_rank
  splits_not_at_rank = 0
  j = 0
  for rank in range(1, max_rank + 1):
    i = rank - 1
    total = num_splits - splits_not_at_rank

    while j < len(ranks) and ranks[j] == rank:
      j += 1
      pos[i] += 1

    neg[i] = total - pos[i] + neg[i-1]
    pos[i] += pos[i-1]

    if rank in out_of_counts:
      splits_not_at_rank += out_of_counts[rank]

  neg = [x/neg[-1] for x in neg]
  pos = [x/pos[-1] for x in pos]

  out = ['Rank\tFPR\tTPR\n'] + \
        [('%d\t%.3f\t%.3f\n' % (i + 1, neg[i], pos[i])) for i in range(len(neg))]

  with open(new_filename, 'w') as f:
    f.writelines(out)


def get_ndcg(filename, rank_cutoff=None):
  '''
  filename - str, name of the file from which ndcg will be calculated
    should be tsv with 5 columns: Indic  Split  Cmpd  Rank  OutOf
    (file generated using benchmarking function)
  rank_cutoff  - int, max rank at or below which NDCG will be calculated
    None will result in no rank cutoff being used

  Prints and returns normalized discounted cumulative gain for given data
  '''
  with open(filename, 'r') as f:
    lines = f.read().strip().split('\n')

  if not lines[0].split('\t')[3].isdigit():
    lines = lines[1:]

  # Extract data (splits and ranks per split)
  splits = {}
  for line in lines:
    cells = line.split('\t')
    key = (cells[0], cells[1])
    if key in splits:
      splits[key].append(int(cells[3]))
    else:
      splits[key] = [int(cells[3])]

  if rank_cutoff == None:
    rank_cutoff = max([max(x) for x in splits.values()])

  # Calculate DCG and IDCG per split, then sum up
  dcg = 0
  idcg = 0
  for split, ranks in splits.items():
    ideals = list(range(1,len(ranks)+1))
    for rank, ideal in zip(ranks, ideals):
      if rank <= rank_cutoff:
        dcg += (1)/(math.log(rank + 1, 2))
      if ideal <= rank_cutoff:
        idcg += (1)/(math.log(ideal + 1, 2))

  print('NDCG', dcg/idcg, ('at cutoff ' + \
        str(rank_cutoff)) if rank_cutoff < max([max(x) for x in splits.values()]) else '')
  return dcg/idcg


##############################################################################
# Main benchmarking function                                                 #
##############################################################################

def benchmarking(drug_indic_mapping, cmpd_set, gen_func, bench_func,\
                 gen_func_args={}, bench_func_args={}, out_file='',
                 pass_indic_id=False,use_indic_name=False,use_drug_name=False):
  '''
  Primary benchmarking function; provides

  drug_indic_mapping - dict of str:list, known drug-indication mapping
    each indication name/id (key) is mapped to 1+ drug name/ids (values)
  cmpd_set - set of str, all drug name/ids in the drug-indic mapping
    compounds not in the drug-indication mappings may also be included
  gen_func - func, splitting function (from data-splitting generators)
    must take as input the drug-indication mapping (drug_indic_mapping)
      may take additional input stored in gen_func_args dictionary
    must yield indiv. indication name/ids, associated drugs, & withheld drug(s)
  bench_func - func, the function to be benchmarked
    generally a wrapper function that interfaces with the actual platform
    must take as input a set of indicated drugs and a set of non-inidcated drugs
      if pass_indic_id is True, must also take as input the indication name/id
      may take additional arguments stored in bench_func_args dictionary
    must output a sorted list of all compounds in non-indicated drug list
      sorting order: most likely to be effective for that indication to least
  gen_func_args (optional) - dict of str:any, additional args for gen_func
    ex: number of splits, file for splits to be read from
    to use read_strat_xfold_cross, would pass gen_func_args={'filename':name}
  bench_func_args (optional) - dict of str:any, additional args for bench_func
    ex: pre-initialized objects to be used (see cando_wrapper.py)
  pass_indic_id (optional) - bool, default False
    if True, bench_func will receive indication name/id as its third argument

  Passes following arguments to
  Creates results file in TSV format, 5 columns
    indic - str, indication name/id
    split - int, number of split in which association was assessed
    cmpd - str, compound name/id
    rank - int, the predicted rank of the withheld drug for the indication
    out of - int, the number of compounds the drug was ranked against
  '''

  print('Benchmarking...')

  if not out_file:
    out_file = 'unnamed_benchmarking.tsv'

  with open(out_file, 'w') as f:
    f.write('Indic\tSplit\tCmpd\tRank\tOut Of\n')

  indic_counters = {}
  for indic, drugs, left_outs in gen_func(drug_indic_mapping, **gen_func_args):
    # create indication based on drugs, assess ranks of left_outs
    if indic not in indic_counters:
      indic_counters[indic] = 1
    else:
      indic_counters[indic] += 1

    if len(left_outs) == 0:
      continue

    non_indic = cmpd_set - set(drugs)
    if pass_indic_id:
      ranked_list = bench_func(set(drugs), non_indic, indic,\
                               **bench_func_args)
    else:
      ranked_list = bench_func(set(drugs), non_indic, **bench_func_args)

    ranks = get_rank(ranked_list, left_outs)

    out = ''
    for left_drug, rank in zip(left_outs, ranks):
      out += '%s\t%d\t%s\t%d\t%d\n' % (indic, indic_counters[indic],\
                                           left_drug, rank, len(non_indic))

    with open(out_file, 'a') as f:
      f.write(out)

##############################################################################
# Test/sample code                                                           #
##############################################################################

def random_control(indic, non_indic, indic_id=None):
  '''
  Provide as an example of how prediction functions receive data

  indic - set of str, names or ids of all drugs associated with an indication
    (predictions should be made based on this set)
  non_indic - set of str, names/ids of all other, unassociated drugs
    (will contain 1+ withheld indication-associated drugs for assessment)
  indic_id (optional) - str, the id of the indication being assessed
    (can be used when the benchmarked function requires indication information)
    (will only be passed if pass_indic_id=True in benchmarking protocol)
  (Additional parameters, if necessary, may be passed through bench_func_args
    argument of benchmarking protocol)

  Returns ranks (list of str), ordered list of drug names/ids
    (Should contain all drugs from non_indic set and none from indic set)
    (Should be ordered most-least likely to be effective for this indication)
  '''
  ranks = list(non_indic)
  random.shuffle(ranks)
  return ranks

def alphabetical_test(indic, non_indic, indic_id=None):
  '''
  Serves as a deterministic test to check that this module is functioning as expecte

  indic - set of str, names or ids of all drugs associated with an indication
  non_indic - set of str, names/ids of all other, unassociated drugs
  indic_id (optional) - str, the id of the indication being assessed

  Returns ranks (list of str), alphabetically ordered list of drug names/ids
  '''
  ranks = list(non_indic)
  return sorted(non_indic)

if __name__ == '__main__':
  # Prepare the drug-indication mapping
  indic_drug_map, cmpd_set = \
                  extract_drug_indic_mapping('data/ctd_approved_drugs.tsv',\
                                   min_d=2,key='iid',val='did')

  # Benchmark on the given drug-indication mapping and training/testing splits
  benchmarking(indic_drug_map, cmpd_set, read_strat_xfold_cross, random_control,\
               {'filename':'data/id_10fold_cross_seed0_ctd_cnd.tsv'}, {}, 'randomized_results.tsv',\
               pass_indic_id=True)

  # Calculate AUROC and NDCG from the results; will print out to terminal
  get_auroc('randomized_results.tsv', max_fpr=0.05) # theoretically 0.00125
  get_auroc('randomized_results.tsv') # theoretically 0.5
  get_ndcg('randomized_results.tsv', rank_cutoff=10)
  get_ndcg('randomized_results.tsv')
  write_auroc_graphable('randomized_results.tsv','random_TPR_FPR_by_rank.tsv')
  print()

  benchmarking(indic_drug_map, cmpd_set, read_strat_xfold_cross, alphabetical_test,\
               {'filename':'data/id_10fold_cross_seed0_ctd_cnd.tsv'}, {}, 'alphabetical_results.tsv',\
               pass_indic_id=True)

  # Calculate AUROC and NDCG from the results; will print out to terminal
  get_auroc('deterministic_results.tsv', max_fpr=0.05) # theoretically 0.00125
  get_auroc('deterministic_results.tsv') # theoretically 0.5
  get_ndcg('deterministic_results.tsv', rank_cutoff=10)
  get_ndcg('deterministic_results.tsv')
  write_auroc_graphable('randomized_results.tsv','alphabetical_TPR_FPR_by_rank.tsv')