-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathblastsumer_simple.py
More file actions
159 lines (141 loc) · 6.9 KB
/
blastsumer_simple.py
File metadata and controls
159 lines (141 loc) · 6.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# local_blast_zum.py
#
# Run on Python3
# Created by Alice on 2018-06-26.
#
import argparse
from os import listdir, remove, makedirs
from os.path import exists, isdir, isfile, join
from statistics import mean
from Bio.Blast.Applications import NcbiblastnCommandline
import xml.etree.ElementTree as ET
from Bio.Blast import NCBIXML
from match_db import Match_db
def main():
args = get_arguments()
result_file = "temp_out.xml"
database_path = "~/nematode_ref/nematodeDB"
#"~/Documents/larvkult_1508/nematodeDB"
if isdir(args.input):
files = [file for file in listdir(args.input)
if isfile(join(args.input, file)) and
file.split('.')[1]=="fa"]
elif isfile(args.input) and args.input.split(".")[-1]=="fa":
files = [args.input]
else:
raise NameError('Input file or directory does not exist or is not .fa format')
if args.verbose:
print("\n---- Loaded input files ----")
print(*files, sep='\n')
result_file_2 = "temp_out2.txt"
results = open(result_file_2, "w")
hits ={}
for file in files:
if args.verbose:
print("\nBlasting file: {}".format(file))
blastn_cmd=NcbiblastnCommandline(query=file, db=database_path,
max_target_seqs=10, gapopen=2, gapextend=3,
outfmt="5", out=result_file)
stdout, stderr = blastn_cmd()
print(stdout)
#Parse results
tree = ET.parse(result_file)
root = tree.getroot()
for read in root.iter('Iteration'):
if int(read.find('Iteration_iter-num').text)%100==0:
print("Read number: " + read.find('Iteration_iter-num').text)
read_name = read.find('Iteration_query-def').text
read_length = float(read.find('Iteration_query-len').text)
matches = {}
for hit in read.iter('Hit'):
hit_name = hit.find('Hit_def').text
hit_len = float(hit.find('Hit_len').text)
for hsp in hit.iter('Hsp'):
match_len = float(hsp.find('Hsp_identity').text)
perc_match = 0
if hit_len < read_length:
perc_match = round(match_len/hit_len, 4)
else:
perc_match = round(match_len/read_length, 4)
identity = float(hsp.find('Hsp_identity').text)
e_val = hsp.find('Hsp_evalue').text
alg_len = hsp.find('Hsp_align-len').text
pid = (identity/float(alg_len))*100
gaps = int(hsp.find('Hsp_gaps').text)
missmatch = int(float(alg_len) - gaps - identity)
matches[hit_name] = [read_name, hit_name.split(" ")[0], hit_name,
str(pid), e_val, str(alg_len), "0", "0", str(missmatch),
"0", str(gaps), str(perc_match*100)]
string_2_write = ""
if len(matches)>0:
best_hit = max(matches, key= lambda key: matches[key][-1])
#print("{}% {}".format(matches[best_hit][-1], best_hit))
string_2_write = matches[best_hit][0] + "\t" + matches[best_hit][1] \
+ "\t" + matches[best_hit][2] + "\t" + matches[best_hit][3] \
+ "\t" + matches[best_hit][4] + "\t" + matches[best_hit][5] \
+ "\t" + matches[best_hit][6] + "\t" + matches[best_hit][7] \
+ "\t" + matches[best_hit][8] + "\t" + matches[best_hit][9] \
+ "\t" + matches[best_hit][10] + "\t" + matches[best_hit][11] + "\n"
else:
string_2_write = read_name + "\tshortname\tlongname\t0.0\t1\t0\t0\t0\t0\t0\t0\t0\n"
#print("No hit found for read id: {}".format(read_name))
results.write(string_2_write)
results.close()
hits = summarize_blast_results(result_file_2, hits, args.pid, args.eval)
save_2_file(hits,args.output, args.verbose)
def get_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("input", help="the input fasta files")
parser.add_argument("-v", "--verbose", help="print more info",
action="store_true")
parser.add_argument("-o", "--output", help="name of output file",
required=True)
parser.add_argument("--pid", help="Threshold of percentage identity of hits",
default=60.0, type=float)
parser.add_argument("--eval", help="Threshold of e-val of hits",
default=1e-10, type=float)
args = parser.parse_args()
if args.output.split(".")[-1]!='txt':
args.output += ".txt"
return args
def summarize_blast_results(results_file, hits, perc_id_thresh, e_val_thresh):
with open(results_file) as res_file:
for line in res_file:
query_res = line.split('\t')
short_name = query_res[1]
perc_id = float(query_res[3])
e_val = float(query_res[4])
alg_len=int(query_res[5])
if alg_len < 150 or perc_id < perc_id_thresh or e_val > e_val_thresh:
short_name = 'None'
query_res[2] = 'None'
if short_name in hits:
hits[short_name].add_read(read=query_res[0],
pid=perc_id, alg_len=int(query_res[5]),
e_val=e_val, missmatch=int(query_res[8]),
gaps=int(query_res[10]),
gaps_o=int(query_res[9]))
else:
hits[short_name] = Match_db(sn=short_name, name=query_res[2],
pid=perc_id, alg_len=int(query_res[5]),
e_val=e_val, missmatch=int(query_res[8]),
gaps=int(query_res[10]),
gaps_o=int(query_res[9]),
read=query_res[0])
res_file.close()
#remove(results_file)
return hits
def save_2_file(hits, output_file, verbosity):
tuple_hits = [ [match.name, match.count, mean(match.pid), mean(match.e_val),
mean(match.alg_len), mean(match.missmatch), mean(match.gaps),
mean(match.gap_openings)] for short_name, match in hits.items()]
sorted_hits = sorted(tuple_hits, key=lambda x: x[1], reverse=True)
sorted_hits_str = ["\t".join(list(map(str, hit)))+"\n" for hit in sorted_hits]
if verbosity:
print("\n---- Top 10 hits ----")
print(*sorted_hits_str[:10], sep='')
print("\nSaving results to: {}".format(output_file))
filehandle = open(output_file, "w")
filehandle.writelines(sorted_hits_str)
filehandle.close()
main()