|
2 | 2 |
|
3 | 3 | import sys, argparse, sh |
4 | 4 |
|
5 | | -VERSION = "1.0.2" |
| 5 | +## A function to return the number of lines of a file |
| 6 | +def file_len(fname): |
| 7 | + with open(fname) as f: |
| 8 | + for i, l in enumerate(f): |
| 9 | + pass |
| 10 | + return i + 1 |
| 11 | + |
| 12 | +## A function to return the number of genotypes per line in a .geno file. |
| 13 | +def file_width(fname): |
| 14 | + with open(fname) as f: |
| 15 | + for i in f: |
| 16 | + return(len(i.strip())) |
| 17 | + break |
| 18 | + |
| 19 | +## Function to check the consistency of an eigenstrat database |
| 20 | +def validate_eigenstrat(genof, snpf, indf): |
| 21 | + dimsGeno = [file_len(genof), file_width(genof)] |
| 22 | + linesSnp = file_len(snpf) |
| 23 | + linesInd = file_len(indf) |
| 24 | + |
| 25 | + # print(dimsGeno,linesSnp,linesInd) |
| 26 | + ##Check geno and snp compatibility |
| 27 | + if dimsGeno[0] != linesSnp: |
| 28 | + raise IOError("Input .snp and .geno files do not match.") |
| 29 | + |
| 30 | + ##Check geno and ind compatibility |
| 31 | + if dimsGeno[1] != linesInd: |
| 32 | + raise IOError("Input .ind and .geno files do not match.") |
| 33 | + |
| 34 | +VERSION = "1.1.0" |
6 | 35 |
|
7 | 36 | parser = argparse.ArgumentParser(usage="%(prog)s (-i <Input file prefix>) (-c <input ind file> | -R | -E) [-L <SAMPLE LIST> | -S Ind [-S Ind2]] [-o <OUTPUT FILE PREFIX>]" , description="A tool to check two different EingenStrat databses for shared individuals, and extract or remove individuals from an EigenStrat database.") |
8 | 37 | parser._optionals.title = "Available options" |
9 | | -parser.add_argument("-i", "--Input", type=str, metavar="<INPUT FILES PREFIX>", required=True, help="The desired input file prefix. Input files are assumed to be <INPUT PREFIX>.geno, <INPUT PREFIX>.snp and <INPUT PREFIX>.ind .") |
| 38 | +parser.add_argument("-g", "--genoFn", type = str, metavar = "<GENO FILE NAME>", required = True, help = "The path to the input geno file.") |
| 39 | +parser.add_argument("-s", "--snpFn", type = str, metavar = "<SNP FILE NAME>", required = True, help = "The path to the input snp file.") |
| 40 | +parser.add_argument("-i", "--indFn", type = str, metavar = "<IND FILE NAME>", required = True, help = "The path to the input ind file.") |
10 | 41 | parser.add_argument("-o", "--Output", type=str, metavar="<OUTPUT FILES PREFIX>", required=False, help="The desired output file prefix. Three output files are created, <OUTPUT FILES PREFIX>.geno , <OUTPUT FILES PREFIX>.snp and <OUTPUT FILES PREFIX>.ind .") |
11 | | -parser.add_argument("-s", "--Suffix", type = str, metavar = "<INPUT FILE SUFFIX>", required = False, default = '', help = "The suffix (if any) that follows .geno/.snp/.ind in the input files. For example, specifying '-s .txt' will treat <INPUT PREFIX>.{geno,snp,ind}.txt as the desired input files.") |
12 | 42 | group = parser.add_mutually_exclusive_group(required=True) |
13 | 43 | group2 = parser.add_mutually_exclusive_group(required=False) |
14 | 44 | group.add_argument("-C", "--Check", type=argparse.FileType('r'), metavar="<INPUT FILE>", required=False, help="Check the -i .ind file and the second .ind file for duplicate individuals. Population assignment and/or individual sex are not checked, only individual names. Names are case sensitive.") |
|
20 | 50 | args = parser.parse_args() |
21 | 51 |
|
22 | 52 | if args.Extract is True or args.Remove is True: |
23 | | - if args.SampleList is None and args.Sample is None: |
24 | | - parser.error("Sample (-S) or SampleList (-L) required for -R/-E functions.") |
| 53 | + if args.SampleList is None and args.Sample is None: |
| 54 | + parser.error("Sample (-S) or SampleList (-L) required for -R/-E functions.") |
25 | 55 |
|
26 | 56 | if args.Extract is True or args.Remove is True: |
27 | | - if args.Output is None: |
28 | | - parser.error("Output files (-o) must be specified when using -R/-E functions") |
| 57 | + if args.Output is None: |
| 58 | + parser.error("Output files (-o) must be specified when using -R/-E functions") |
29 | 59 |
|
30 | | -IndFile = open(args.Input+".ind"+args.Suffix, "r") |
31 | | -GenoFile = open(args.Input+".geno"+args.Suffix, "r") |
32 | | -SnpFile = open(args.Input+".snp"+args.Suffix, "r") |
| 60 | +IndFile = open(args.indFn, "r") |
| 61 | +GenoFile = open(args.genoFn, "r") |
| 62 | +SnpFile = open(args.snpFn, "r") |
33 | 63 |
|
34 | 64 |
|
35 | 65 | #Check function |
36 | 66 | if args.Check != None: |
37 | | - Inds1 = [] |
38 | | - Inds2 = [] |
39 | | - c = 0 |
40 | | - for line in IndFile: |
41 | | - fields = line.strip().split() |
42 | | - Inds1.append(fields[0]) |
43 | | - |
44 | | - for line in args.Check: |
45 | | - fields = line.strip().split() |
46 | | - Inds2.append(fields[0]) |
47 | | - |
48 | | - for ind in Inds1: |
49 | | - if ind in Inds2: |
50 | | - print ("{:25s}".format(ind), "<---", "Duplicate individual", sep="\t", file=sys.stdout) |
51 | | - c+=1 |
52 | | - print ("#Duplicate individual check finished.", c, "duplicate individuals found.", sep=" ", file =sys.stdout) |
53 | | - sys.exit(0) |
54 | | - |
55 | | -#Check for errors in input files |
56 | | -##Check geno and snp compatibility |
57 | | -lineNo = "" |
58 | | -for line in sh.grep(sh.wc("-l", args.Input+".geno"+args.Suffix, args.Input+".snp"+args.Suffix), args.Input): |
59 | | - if lineNo=="": |
60 | | - lineNo=line.strip().split()[0] |
61 | | - elif lineNo==line.strip().split()[0]: |
62 | | - break |
63 | | - elif lineNo!=line.strip().split()[0]: |
64 | | - raise IOError("Input .snp and .geno files do not match.") |
65 | | - |
66 | | -##Check geno and ind compatibility |
67 | | -with open(args.Input+".geno"+args.Suffix, "r") as f: |
68 | | - for line in f: |
69 | | - if str(len(line.strip())) == sh.wc("-l", args.Input+".ind"+args.Suffix).strip().split()[0]: |
70 | | - break |
71 | | - else: |
72 | | - raise IOError("Input .ind and .geno files do not match.") |
| 67 | + Inds1 = [] |
| 68 | + Inds2 = [] |
| 69 | + c = 0 |
| 70 | + for line in IndFile: |
| 71 | + fields = line.strip().split() |
| 72 | + Inds1.append(fields[0]) |
| 73 | + |
| 74 | + for line in args.Check: |
| 75 | + fields = line.strip().split() |
| 76 | + Inds2.append(fields[0]) |
| 77 | + |
| 78 | + for ind in Inds1: |
| 79 | + if ind in Inds2: |
| 80 | + print ("{:25s}".format(ind), "<---", "Duplicate individual", sep="\t", file=sys.stdout) |
| 81 | + c+=1 |
| 82 | + print ("#Duplicate individual check finished.", c, "duplicate individuals found.", sep=" ", file =sys.stdout) |
| 83 | + sys.exit(0) |
| 84 | + |
| 85 | +## Perform checks on Eigenstrat dataset |
| 86 | +validate_eigenstrat(args.genoFn, args.snpFn, args.indFn) |
73 | 87 |
|
74 | 88 | #Read sample names into list of individuals of interest, from either input option |
75 | 89 | Samples = [] |
76 | 90 | if args.SampleList != None: |
77 | | - for line in args.SampleList: |
78 | | - fields = line.strip().split() |
79 | | - if fields[0][0]!="#": |
80 | | - Samples.append(fields[0]) |
| 91 | + for line in args.SampleList: |
| 92 | + fields = line.strip().split() |
| 93 | + if fields[0][0]!="#": |
| 94 | + Samples.append(fields[0]) |
81 | 95 |
|
82 | 96 | if args.Sample != None: |
83 | | - for i in args.Sample: |
84 | | - if i not in Samples: |
85 | | - Samples.append(i) |
| 97 | + for i in args.Sample: |
| 98 | + if i not in Samples: |
| 99 | + Samples.append(i) |
86 | 100 |
|
87 | 101 | if args.Remove == True: |
88 | | - print ("Detected", len(Samples), "individuals for REMOVAL.", sep=" ", file=sys.stderr) |
| 102 | + print ("Detected", len(Samples), "individuals for REMOVAL.", sep=" ", file=sys.stderr) |
89 | 103 | elif args.Extract == True: |
90 | | - print ("Detected", len(Samples), "individuals for EXTRACTION.", sep=" ", file=sys.stderr) |
| 104 | + print ("Detected", len(Samples), "individuals for EXTRACTION.", sep=" ", file=sys.stderr) |
91 | 105 |
|
92 | 106 | #Get sample index in geno and ind files. |
93 | 107 | Index = {} |
94 | 108 | Sex = {} |
95 | 109 | Pop = {} |
96 | 110 | for i in Samples: |
97 | | - for line in sh.grep(sh.cat("-n",args.Input+".ind"+args.Suffix), "{}".format(i),_ok_code=[0,1]): |
98 | | - fields=line.strip().split() |
99 | | - if fields[1] == i: |
100 | | - Index[fields[1]]=(int(fields[0]) -1) |
101 | | - Sex [fields[1]]=fields[2] |
102 | | - Pop [fields[1]]=fields[3] |
| 111 | + for line in sh.grep(sh.cat("-n",args.indFn), "{}".format(i),_ok_code=[0,1]): |
| 112 | + fields=line.strip().split() |
| 113 | + if fields[1] == i: |
| 114 | + Index[fields[1]]=(int(fields[0]) -1) |
| 115 | + Sex [fields[1]]=fields[2] |
| 116 | + Pop [fields[1]]=fields[3] |
103 | 117 |
|
104 | 118 | print ("Indexed", len(Index), "individuals.", sep =" ", file=sys.stderr) |
105 | 119 |
|
|
109 | 123 |
|
110 | 124 | #Extract function |
111 | 125 | if args.Extract == True: |
112 | | - for line in GenoFile: |
113 | | - fields=line.strip() |
114 | | - for i in Samples: |
115 | | - if i in Index: |
116 | | - print (fields[Index[i]], end="", file=GenoOutFile) |
117 | | - print("",file=GenoOutFile) |
118 | | - |
119 | | - for i in Samples: |
120 | | - try: print (i, Sex[i], Pop[i], sep="\t", file=IndOutFile) |
121 | | - except KeyError: |
122 | | - print ("Individual \"",i,"\" not found in ",args.Input+".ind"+args.Suffix+" file.", sep="", file=sys.stderr) |
123 | | - |
124 | | - for line in SnpFile: |
125 | | - print (line, end="", file =SnpOutFile) |
126 | | - print ("Extraction of ", len(Index)," individuals complete.", sep="", file=sys.stderr) |
127 | | - sys.exit(0) |
| 126 | + for line in GenoFile: |
| 127 | + fields=line.strip() |
| 128 | + for i in Samples: |
| 129 | + if i in Index: |
| 130 | + print (fields[Index[i]], end="", file=GenoOutFile) |
| 131 | + print("",file=GenoOutFile) |
| 132 | + |
| 133 | + for i in Samples: |
| 134 | + try: print (i, Sex[i], Pop[i], sep="\t", file=IndOutFile) |
| 135 | + except KeyError: |
| 136 | + print ("Individual \"",i,"\" not found in ",args.indFn," file.", sep="", file=sys.stderr) |
| 137 | + |
| 138 | + for line in SnpFile: |
| 139 | + print (line, end="", file =SnpOutFile) |
| 140 | + print ("Extraction of ", len(Index)," individuals complete.", sep="", file=sys.stderr) |
| 141 | + sys.exit(0) |
128 | 142 |
|
129 | 143 | #Remove function |
130 | 144 | sorted_indx = tuple(sorted(Index.values())) |
131 | 145 |
|
132 | 146 | if args.Remove == True: |
133 | | - for line in GenoFile: |
134 | | - count=0 |
135 | | - repeat=0 |
136 | | - fields=line.strip() |
137 | | - for i in range(len(sorted_indx)+1): |
138 | | - if repeat == len(sorted_indx): |
139 | | - print (fields[count:], end = "", file=GenoOutFile) |
140 | | - break |
141 | | - else: |
142 | | - print (fields[count:sorted_indx[i]], end="", file=GenoOutFile) |
143 | | - count=sorted_indx[i]+1 |
144 | | - repeat += 1 |
145 | | - print("",file=GenoOutFile) |
146 | | - |
147 | | - for line in IndFile: |
148 | | - fields=line.strip().split() |
149 | | - if fields[0] not in Samples: |
150 | | - print (line, end="", file=IndOutFile) |
151 | | - |
152 | | - |
153 | | - for line in SnpFile: |
154 | | - print (line, end="", file=SnpOutFile) |
155 | | - print ("Removal of ", len(Index)," individuals complete.", sep="", file=sys.stderr) |
156 | | - sys.exit(0) |
| 147 | + for line in GenoFile: |
| 148 | + count=0 |
| 149 | + repeat=0 |
| 150 | + fields=line.strip() |
| 151 | + for i in range(len(sorted_indx)+1): |
| 152 | + if repeat == len(sorted_indx): |
| 153 | + print (fields[count:], end = "", file=GenoOutFile) |
| 154 | + break |
| 155 | + else: |
| 156 | + print (fields[count:sorted_indx[i]], end="", file=GenoOutFile) |
| 157 | + count=sorted_indx[i]+1 |
| 158 | + repeat += 1 |
| 159 | + print("",file=GenoOutFile) |
| 160 | + |
| 161 | + for line in IndFile: |
| 162 | + fields=line.strip().split() |
| 163 | + if fields[0] not in Samples: |
| 164 | + print (line, end="", file=IndOutFile) |
| 165 | + |
| 166 | + |
| 167 | + for line in SnpFile: |
| 168 | + print (line, end="", file=SnpOutFile) |
| 169 | + print ("Removal of ", len(Index)," individuals complete.", sep="", file=sys.stderr) |
| 170 | + sys.exit(0) |
157 | 171 |
|
0 commit comments