之前用c写过一个程序,查找reads中是否包含了adaptor,如果检测到的话就过滤掉含有adaptor的reads,这次在过滤完数据之后发现接头序列比较多,为了提升组装效果,又不能很大地影响数据量,需要对接头进行截断处理,并过滤过短的reads,用python写了一个简短的程序,指定超过3个错配以内的匹配都认为匹配到,并且长度小于50bp的reads过滤,在以下程序基础上添加传入参数,可以适用比较多的情况(单端、双端、含有single等):
1 import sys 2 import re 3 from Bio import SeqIO 4 5 def rmPE(read1,read2,adaptor1,adaptor2,min_length): 6 res_1 = rmSE(read1,adaptor1,min_length) 7 res_2 = rmSE(read2,adaptor2,min_length) 8 if res_1 and res_2: 9 return res_1,res_2 10 else: 11 return False 12 13 def rmSE(read,adaptor,min_length): 14 seq = read.seq 15 seed_len = 6 16 a_len = len(adaptor) 17 seq_len = len(seq) 18 for i in range(a_len - seed_len): 19 seed = adaptor[i:i+seed_len] 20 pos = 0 21 while(pos < seq_len): 22 find_pos = seq.find(seed,pos) 23 if find_pos > 0: 24 mistaken_count = 0 25 _b = find_pos 26 _e = find_pos + seed_len 27 while(_b >= 0 and i >= find_pos - _b): 28 if adaptor[i - find_pos + _b] != seq[_b]: 29 mistaken_count += 1 30 if mistaken_count > 3: 31 break 32 _b -= 1 33 else : 34 while(_e < seq_len and i - find_pos + _e < a_len): 35 if adaptor[ i - find_pos + _e ] != seq[_e]: 36 mistaken_count += 1 37 if mistaken_count > 3: 38 break 39 _e += 1 40 else: 41 if find_pos - i > min_length: 42 return read[:find_pos-i] 43 else : 44 return False 45 pos = find_pos + 1 46 else: 47 break 48 return read 49 50 def rmAdaptor(argv): 51 argv.pop(0) 52 type = argv.pop(0) 53 if type=='PE': 54 read1_file,read2_file,adaptor1,adaptor2,out_prefix,min_length = argv 55 read2_records = SeqIO.parse(open(read2_file),'fastq') 56 read1_out = open( '%s.1.fq'%out_prefix,'w' ) 57 read2_out = open( '%s.2.fq'%out_prefix,'w' ) 58 for read1 in SeqIO.parse(open(read1_file),'fastq'): 59 read2 = read2_records.next() 60 rmPE_res = rmPE(read1,read2,adaptor1,adaptor2,min_length) 61 if rmPE_res: 62 read1_out.write(rmPE_res[0].format('fastq')) 63 read2_out.write(rmPE_res[1].format('fastq')) 64 elif type=='SE': 65 reads_file,adaptor,out_prefix,min_length = argv 66 reads_out = open( '%s.single.fq'%out_prefix,'w' ) 67 for reads in SeqIO.parse(open(reads_file),'fastq'): 68 rmSE_res = False 69 if re.search('[s/](d)',reads.id).group(1) == '1': 70 rmSE_res = rmSE(reads,adaptor1,min_length) 71 elif re.search('[s/](d)',reads.id).group(1) == '2': 72 rmSE_res = rmSE(reads,adaptor2,min_length) 73 if rmSE_res: 74 reads_out.write(rmSE_res.format('fastq')) 75 76 if __name__ == '__main__': 77 rmAdaptor(sys.argv)