• 依据gff切fa并翻译为蛋白质


    #!/usr/bin/python
    import re
    import sys
    import gzip
    
    change={'A':'T','T':'A','C':'G','G':'C','N':'N'}
    
    CODE = {
        'GCA' : 'A', 'GCC' : 'A', 'GCG' : 'A', 'GCT' : 'A',                     
        'TGC' : 'C', 'TGT' : 'C',                                                           # Cysteine
        'GAC' : 'D', 'GAT' : 'D',                                                           # Aspartic Acid
        'GAA' : 'E', 'GAG' : 'E',                                                           # Glutamic Acid
        'TTC' : 'F', 'TTT' : 'F',                                                           # Phenylalanine
        'GGA' : 'G', 'GGC' : 'G', 'GGG' : 'G', 'GGT' : 'G',                               # Glycine
        'CAC' : 'H', 'CAT' : 'H',                                                           # Histidine
        'ATA' : 'I', 'ATC' : 'I', 'ATT' : 'I',                                             # Isoleucine
        'AAA' : 'K', 'AAG' : 'K',                                                           # Lysine
        'CTA' : 'L', 'CTC' : 'L', 'CTG' : 'L', 'CTT' : 'L', 'TTA' : 'L', 'TTG' : 'L',   # Leucine
        'ATG' : 'M',                                                                         # Methionine
        'AAC' : 'N', 'AAT' : 'N',                                                           # Asparagine
        'CCA' : 'P', 'CCC' : 'P', 'CCG' : 'P', 'CCT' : 'P',                               # Proline
        'CAA' : 'Q', 'CAG' : 'Q',                                                           # Glutamine
        'CGA' : 'R', 'CGC' : 'R', 'CGG' : 'R', 'CGT' : 'R', 'AGA' : 'R', 'AGG' : 'R',   # Arginine
        'TCA' : 'S', 'TCC' : 'S', 'TCG' : 'S', 'TCT' : 'S', 'AGC' : 'S', 'AGT' : 'S',   # Serine
        'ACA' : 'T', 'ACC' : 'T', 'ACG' : 'T', 'ACT' : 'T',                               # Threonine
        'GTA' : 'V', 'GTC' : 'V', 'GTG' : 'V', 'GTT' : 'V',                               # Valine
        'TGG' : 'W',                                                                         # Tryptophan
        'TAC' : 'Y', 'TAT' : 'Y',                                                           # Tyrosine
        'TAA' : '*', 'TAG' : '*', 'TGA' : '*'                                              # Stop
    }
    
    
    
    
    def readfa(l):
        col={}
        arr =[]
        sca =''
        li= gzip.open(l,'rb')
        for line in li:
            if '>' in line:
                arr =[]
                sca = line.split()[0].lstrip('>')
                col[sca]=arr
            else:
                without = re.sub(r'
    ',"",line)
                arr.append(without)
        return col
    
    
    
    def readgff(l):
        col ={}
        arr =[]
        li= gzip.open(l,'rb')
        for line in li:
            sp = line.split( )
            if sp[2] == 'mRNA':
                gene = re.match(r'ID=(.*?);',sp[8]).group(1)
                arr=[]
                col[gene]=[arr,sp[0],sp[6]]
    #            start=sp[3]
            elif sp[2] == 'CDS':
                gene = re.match(r'Parent=(.*?);',sp[8]).group(1)
                col[gene][0].append([int(sp[3])-1,int(sp[4])])
        return col
    #main###
    
    
    out= gzip.open(sys.argv[3],'wb')
    gff=readgff(sys.argv[2])
    c=gff
    s=''
    fa =readfa(sys.argv[1])
    
    
    for k1,v1 in gff.items():
        if v1[1] in fa.keys():
            lon=s.join(fa[v1[1]])
            short=''
            for i in v1[0]:
                short +=lon[i[0]:i[1]]
            if v1[2] == '-':
                short=''.join(change[i] for i in short)[::-1]
            j=0
            AA=''
            while j <= (len(short)-3):
                sp = short[j:3+j]
                if 'N' in sp:
                    j=j+3
                    continue
                else:
                    AA += CODE[sp]
                    j=j+3
            print >>out,">",k1,"
    ",AA
        else:
            print "no fa"
  • 相关阅读:
    TestNG 单元测试框架的使用
    HDU1255 覆盖的面积(线段树+扫描线)
    AcWing1169 糖果(差分约数)
    牛客 Treepath(树形dp)
    牛客 Shortest Path (dfs+思维)
    牛客 树(dfs序)
    牛客 城市网络(倍增)
    牛客 Borrow Classroom (LCA)
    CF710E Generate a String(dp)
    c#委托
  • 原文地址:https://www.cnblogs.com/yuanjingnan/p/11172734.html
Copyright © 2020-2023  润新知