Skip to content

Commit 0c5a3aa

Browse files
committed
v1.1.0. explicit input file naming, not prefixes anymore.
1 parent 325649e commit 0c5a3aa

3 files changed

Lines changed: 123 additions & 108 deletions

File tree

eigenstrat_database_tools.py

Lines changed: 115 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,43 @@
22

33
import sys, argparse, sh
44

5-
VERSION = "1.0.2"
5+
## A function to return the number of lines of a file
6+
def file_len(fname):
7+
with open(fname) as f:
8+
for i, l in enumerate(f):
9+
pass
10+
return i + 1
11+
12+
## A function to return the number of genotypes per line in a .geno file.
13+
def file_width(fname):
14+
with open(fname) as f:
15+
for i in f:
16+
return(len(i.strip()))
17+
break
18+
19+
## Function to check the consistency of an eigenstrat database
20+
def validate_eigenstrat(genof, snpf, indf):
21+
dimsGeno = [file_len(genof), file_width(genof)]
22+
linesSnp = file_len(snpf)
23+
linesInd = file_len(indf)
24+
25+
# print(dimsGeno,linesSnp,linesInd)
26+
##Check geno and snp compatibility
27+
if dimsGeno[0] != linesSnp:
28+
raise IOError("Input .snp and .geno files do not match.")
29+
30+
##Check geno and ind compatibility
31+
if dimsGeno[1] != linesInd:
32+
raise IOError("Input .ind and .geno files do not match.")
33+
34+
VERSION = "1.1.0"
635

736
parser = argparse.ArgumentParser(usage="%(prog)s (-i <Input file prefix>) (-c <input ind file> | -R | -E) [-L <SAMPLE LIST> | -S Ind [-S Ind2]] [-o <OUTPUT FILE PREFIX>]" , description="A tool to check two different EingenStrat databses for shared individuals, and extract or remove individuals from an EigenStrat database.")
837
parser._optionals.title = "Available options"
9-
parser.add_argument("-i", "--Input", type=str, metavar="<INPUT FILES PREFIX>", required=True, help="The desired input file prefix. Input files are assumed to be <INPUT PREFIX>.geno, <INPUT PREFIX>.snp and <INPUT PREFIX>.ind .")
38+
parser.add_argument("-g", "--genoFn", type = str, metavar = "<GENO FILE NAME>", required = True, help = "The path to the input geno file.")
39+
parser.add_argument("-s", "--snpFn", type = str, metavar = "<SNP FILE NAME>", required = True, help = "The path to the input snp file.")
40+
parser.add_argument("-i", "--indFn", type = str, metavar = "<IND FILE NAME>", required = True, help = "The path to the input ind file.")
1041
parser.add_argument("-o", "--Output", type=str, metavar="<OUTPUT FILES PREFIX>", required=False, help="The desired output file prefix. Three output files are created, <OUTPUT FILES PREFIX>.geno , <OUTPUT FILES PREFIX>.snp and <OUTPUT FILES PREFIX>.ind .")
11-
parser.add_argument("-s", "--Suffix", type = str, metavar = "<INPUT FILE SUFFIX>", required = False, default = '', help = "The suffix (if any) that follows .geno/.snp/.ind in the input files. For example, specifying '-s .txt' will treat <INPUT PREFIX>.{geno,snp,ind}.txt as the desired input files.")
1242
group = parser.add_mutually_exclusive_group(required=True)
1343
group2 = parser.add_mutually_exclusive_group(required=False)
1444
group.add_argument("-C", "--Check", type=argparse.FileType('r'), metavar="<INPUT FILE>", required=False, help="Check the -i .ind file and the second .ind file for duplicate individuals. Population assignment and/or individual sex are not checked, only individual names. Names are case sensitive.")
@@ -20,86 +50,70 @@
2050
args = parser.parse_args()
2151

2252
if args.Extract is True or args.Remove is True:
23-
if args.SampleList is None and args.Sample is None:
24-
parser.error("Sample (-S) or SampleList (-L) required for -R/-E functions.")
53+
if args.SampleList is None and args.Sample is None:
54+
parser.error("Sample (-S) or SampleList (-L) required for -R/-E functions.")
2555

2656
if args.Extract is True or args.Remove is True:
27-
if args.Output is None:
28-
parser.error("Output files (-o) must be specified when using -R/-E functions")
57+
if args.Output is None:
58+
parser.error("Output files (-o) must be specified when using -R/-E functions")
2959

30-
IndFile = open(args.Input+".ind"+args.Suffix, "r")
31-
GenoFile = open(args.Input+".geno"+args.Suffix, "r")
32-
SnpFile = open(args.Input+".snp"+args.Suffix, "r")
60+
IndFile = open(args.indFn, "r")
61+
GenoFile = open(args.genoFn, "r")
62+
SnpFile = open(args.snpFn, "r")
3363

3464

3565
#Check function
3666
if args.Check != None:
37-
Inds1 = []
38-
Inds2 = []
39-
c = 0
40-
for line in IndFile:
41-
fields = line.strip().split()
42-
Inds1.append(fields[0])
43-
44-
for line in args.Check:
45-
fields = line.strip().split()
46-
Inds2.append(fields[0])
47-
48-
for ind in Inds1:
49-
if ind in Inds2:
50-
print ("{:25s}".format(ind), "<---", "Duplicate individual", sep="\t", file=sys.stdout)
51-
c+=1
52-
print ("#Duplicate individual check finished.", c, "duplicate individuals found.", sep=" ", file =sys.stdout)
53-
sys.exit(0)
54-
55-
#Check for errors in input files
56-
##Check geno and snp compatibility
57-
lineNo = ""
58-
for line in sh.grep(sh.wc("-l", args.Input+".geno"+args.Suffix, args.Input+".snp"+args.Suffix), args.Input):
59-
if lineNo=="":
60-
lineNo=line.strip().split()[0]
61-
elif lineNo==line.strip().split()[0]:
62-
break
63-
elif lineNo!=line.strip().split()[0]:
64-
raise IOError("Input .snp and .geno files do not match.")
65-
66-
##Check geno and ind compatibility
67-
with open(args.Input+".geno"+args.Suffix, "r") as f:
68-
for line in f:
69-
if str(len(line.strip())) == sh.wc("-l", args.Input+".ind"+args.Suffix).strip().split()[0]:
70-
break
71-
else:
72-
raise IOError("Input .ind and .geno files do not match.")
67+
Inds1 = []
68+
Inds2 = []
69+
c = 0
70+
for line in IndFile:
71+
fields = line.strip().split()
72+
Inds1.append(fields[0])
73+
74+
for line in args.Check:
75+
fields = line.strip().split()
76+
Inds2.append(fields[0])
77+
78+
for ind in Inds1:
79+
if ind in Inds2:
80+
print ("{:25s}".format(ind), "<---", "Duplicate individual", sep="\t", file=sys.stdout)
81+
c+=1
82+
print ("#Duplicate individual check finished.", c, "duplicate individuals found.", sep=" ", file =sys.stdout)
83+
sys.exit(0)
84+
85+
## Perform checks on Eigenstrat dataset
86+
validate_eigenstrat(args.genoFn, args.snpFn, args.indFn)
7387

7488
#Read sample names into list of individuals of interest, from either input option
7589
Samples = []
7690
if args.SampleList != None:
77-
for line in args.SampleList:
78-
fields = line.strip().split()
79-
if fields[0][0]!="#":
80-
Samples.append(fields[0])
91+
for line in args.SampleList:
92+
fields = line.strip().split()
93+
if fields[0][0]!="#":
94+
Samples.append(fields[0])
8195

8296
if args.Sample != None:
83-
for i in args.Sample:
84-
if i not in Samples:
85-
Samples.append(i)
97+
for i in args.Sample:
98+
if i not in Samples:
99+
Samples.append(i)
86100

87101
if args.Remove == True:
88-
print ("Detected", len(Samples), "individuals for REMOVAL.", sep=" ", file=sys.stderr)
102+
print ("Detected", len(Samples), "individuals for REMOVAL.", sep=" ", file=sys.stderr)
89103
elif args.Extract == True:
90-
print ("Detected", len(Samples), "individuals for EXTRACTION.", sep=" ", file=sys.stderr)
104+
print ("Detected", len(Samples), "individuals for EXTRACTION.", sep=" ", file=sys.stderr)
91105

92106
#Get sample index in geno and ind files.
93107
Index = {}
94108
Sex = {}
95109
Pop = {}
96110
for i in Samples:
97-
for line in sh.grep(sh.cat("-n",args.Input+".ind"+args.Suffix), "{}".format(i),_ok_code=[0,1]):
98-
fields=line.strip().split()
99-
if fields[1] == i:
100-
Index[fields[1]]=(int(fields[0]) -1)
101-
Sex [fields[1]]=fields[2]
102-
Pop [fields[1]]=fields[3]
111+
for line in sh.grep(sh.cat("-n",args.indFn), "{}".format(i),_ok_code=[0,1]):
112+
fields=line.strip().split()
113+
if fields[1] == i:
114+
Index[fields[1]]=(int(fields[0]) -1)
115+
Sex [fields[1]]=fields[2]
116+
Pop [fields[1]]=fields[3]
103117

104118
print ("Indexed", len(Index), "individuals.", sep =" ", file=sys.stderr)
105119

@@ -109,49 +123,49 @@
109123

110124
#Extract function
111125
if args.Extract == True:
112-
for line in GenoFile:
113-
fields=line.strip()
114-
for i in Samples:
115-
if i in Index:
116-
print (fields[Index[i]], end="", file=GenoOutFile)
117-
print("",file=GenoOutFile)
118-
119-
for i in Samples:
120-
try: print (i, Sex[i], Pop[i], sep="\t", file=IndOutFile)
121-
except KeyError:
122-
print ("Individual \"",i,"\" not found in ",args.Input+".ind"+args.Suffix+" file.", sep="", file=sys.stderr)
123-
124-
for line in SnpFile:
125-
print (line, end="", file =SnpOutFile)
126-
print ("Extraction of ", len(Index)," individuals complete.", sep="", file=sys.stderr)
127-
sys.exit(0)
126+
for line in GenoFile:
127+
fields=line.strip()
128+
for i in Samples:
129+
if i in Index:
130+
print (fields[Index[i]], end="", file=GenoOutFile)
131+
print("",file=GenoOutFile)
132+
133+
for i in Samples:
134+
try: print (i, Sex[i], Pop[i], sep="\t", file=IndOutFile)
135+
except KeyError:
136+
print ("Individual \"",i,"\" not found in ",args.indFn," file.", sep="", file=sys.stderr)
137+
138+
for line in SnpFile:
139+
print (line, end="", file =SnpOutFile)
140+
print ("Extraction of ", len(Index)," individuals complete.", sep="", file=sys.stderr)
141+
sys.exit(0)
128142

129143
#Remove function
130144
sorted_indx = tuple(sorted(Index.values()))
131145

132146
if args.Remove == True:
133-
for line in GenoFile:
134-
count=0
135-
repeat=0
136-
fields=line.strip()
137-
for i in range(len(sorted_indx)+1):
138-
if repeat == len(sorted_indx):
139-
print (fields[count:], end = "", file=GenoOutFile)
140-
break
141-
else:
142-
print (fields[count:sorted_indx[i]], end="", file=GenoOutFile)
143-
count=sorted_indx[i]+1
144-
repeat += 1
145-
print("",file=GenoOutFile)
146-
147-
for line in IndFile:
148-
fields=line.strip().split()
149-
if fields[0] not in Samples:
150-
print (line, end="", file=IndOutFile)
151-
152-
153-
for line in SnpFile:
154-
print (line, end="", file=SnpOutFile)
155-
print ("Removal of ", len(Index)," individuals complete.", sep="", file=sys.stderr)
156-
sys.exit(0)
147+
for line in GenoFile:
148+
count=0
149+
repeat=0
150+
fields=line.strip()
151+
for i in range(len(sorted_indx)+1):
152+
if repeat == len(sorted_indx):
153+
print (fields[count:], end = "", file=GenoOutFile)
154+
break
155+
else:
156+
print (fields[count:sorted_indx[i]], end="", file=GenoOutFile)
157+
count=sorted_indx[i]+1
158+
repeat += 1
159+
print("",file=GenoOutFile)
160+
161+
for line in IndFile:
162+
fields=line.strip().split()
163+
if fields[0] not in Samples:
164+
print (line, end="", file=IndOutFile)
165+
166+
167+
for line in SnpFile:
168+
print (line, end="", file=SnpOutFile)
169+
print ("Removal of ", len(Index)," individuals complete.", sep="", file=sys.stderr)
170+
sys.exit(0)
157171

eigenstrat_snp_coverage.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import pandas as pd
55
from collections import OrderedDict
66

7-
VERSION = "1.0.2"
7+
VERSION = "1.1.0"
88

99
## A function to return the number of lines of a file
1010
def file_len(fname):
@@ -47,8 +47,9 @@ def get_ind_names(indf):
4747

4848
parser = argparse.ArgumentParser(usage = "%(prog)s (-i <Input file prefix>) [-s <Input file suffix>] [-o <Output filepath>] [-j | --json]" , description = "A tool to check two different EingenStrat databses for shared individuals, and extract or remove individuals from an EigenStrat database.")
4949
parser._optionals.title = "Available options"
50-
parser.add_argument("-i", "--Input", type = str, metavar = "<INPUT FILES PREFIX>", required = True, help = "The desired input file prefix. Input files are assumed to be <INPUT PREFIX>.geno, <INPUT PREFIX>.snp and <INPUT PREFIX>.ind .")
51-
parser.add_argument("-s", "--Suffix", type = str, metavar = "<INPUT FILE SUFFIX>", required = False, default = '', help = "The suffix (if any) that follows .geno/.snp/.ind in the input files. For example, specifying '-s .txt' will treat <INPUT PREFIX>.{geno,snp,ind}.txt as the desired input files.")
50+
parser.add_argument("-g", "--genoFn", type = str, metavar = "<GENO FILE NAME>", required = True, help = "The path to the input geno file.")
51+
parser.add_argument("-s", "--snpFn", type = str, metavar = "<SNP FILE NAME>", required = True, help = "The path to the input snp file.")
52+
parser.add_argument("-i", "--indFn", type = str, metavar = "<IND FILE NAME>", required = True, help = "The path to the input ind file.")
5253
parser.add_argument("-o", "--Output", type = str, metavar = "<OUTPUT FILEPATH>", required = False, help = "The filepath where the output table should be saved. Omit to print to stdout.")
5354
parser.add_argument("-j", "--json", type = str, metavar = "<JSON OUTPUT FILEPATH>", default = None, help = "Create additional json formatted output file named <JSON OUTPUT FILEPATH> .")
5455
parser.add_argument("-v", "--version", action='version', version="%(prog)s {}".format(VERSION), help="Print the version and exit.")
@@ -71,9 +72,9 @@ def get_ind_names(indf):
7172
data['Metadata'] = {'tool_name' : "EigenStratSnpCoverage.py", "version" : VERSION}
7273

7374
## Get input file paths
74-
genof = args.Input+".geno"+args.Suffix
75-
snpf = args.Input+".snp"+args.Suffix
76-
indf = args.Input+".ind"+args.Suffix
75+
genof = args.genoFn
76+
snpf = args.snpFn
77+
indf = args.indFn
7778

7879
## Perform checks on Eigenstrat dataset
7980
validate_eigenstrat(genof, snpf, indf)

rename_snps.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/usr/bin/env python3
22
import sys,argparse
33

4-
VERSION='0.0.1'
4+
VERSION='1.1.0'
55

66
def read_line(line, format):
77
fields=line.strip().split()

0 commit comments

Comments
 (0)