• Python 中统计 FASTA 文件各序列的长度与 GC 含量，以及总长度和总 GC 含量


    001、

    root@PC1:/home/test# ls
    test.fasta  test.py
    root@PC1:/home/test# cat test.fasta             ## 测试文件
    >scaffold_1
    CCCGGGTAAAACGGGTCTTCAAGAAAACGCTCCTCCGTTAATGCCGGCCGATTCAAATAA
    CGCTGATTCTGATTCAGGATATACAATCTGACATGATGAACAGGTTTTCCAATTGGAATC
    CGTT
    >scaffold_2
    CACGCCGCCAGCGTTCGTCCTGAGCCAGGATCAAACTCTCCGATAAATGGATCACAGGTT
    AAGTTCACCGCATCCTGCGGCGACACCTGTGTGGCCTGCGTCGTGCAGGCCCTAGTTTGA
    >scaffold_3
    TTGATCCAGTGGCTCCGGTTACTCCAGTTGATCCTGTTGCGCCTGTTGCTCCAGTTTCTC
    CGGTTGGTCCGGTTGATCCGGTTGCACCTGTTACTCCAGTGGCTCCGGTTACTCCCGTCG
    CACCAGTTTCTCCTGTCGCACCAGTTGATCCTGTTGCGCCTGTTGGTCCTGTATCTCCAG
    >scaffold_4
    CCTGAGCCAGGATCAAACTCTCCGATA
    root@PC1:/home/test# cat test.py                ## 脚本
    #!/usr/bin/python
    """Summarise sequence length and GC content of a FASTA file.

    Reads "test.fasta" from the current directory and writes a tab-separated
    table to "result.txt": a header row, an "all" row holding the totals over
    every record, then one row per record (id, length, G/C count, G/C fraction).
    """

    import re


    def gc_fraction(gc_count, length):
        """Return gc_count / length, or "NA" when length is 0.

        A zero length happens for an empty input file or for a header line
        that is not followed by any sequence; dividing would raise
        ZeroDivisionError there.
        """
        if length == 0:
            return "NA"
        return gc_count / length


    dict1 = dict()    # header line (including the leading ">") -> [length, G/C count]
    len_all = 0       # summed sequence length over all records
    len_all_gc = 0    # summed number of G/C bases (either case) over all records
    id_tem = None     # header of the record currently being accumulated

    # The with-statement closes both files (flushing result.txt) on every exit path.
    with open("test.fasta", "r") as in_file, open("result.txt", "w") as out_file:
        for i in in_file:
            i = i.strip()
            if not i:
                # Blank lines carry no data; indexing i[0] on one would raise IndexError.
                continue
            if i[0] == ">":
                id_tem = i
                # A header that appears again restarts that record's counters.
                dict1[id_tem] = [0, 0]
            else:
                if id_tem is None:
                    raise ValueError("sequence data found before the first '>' header line")
                len_line = len(i)
                len_gc = len(re.findall('[GCgc]', i))
                dict1[id_tem][0] += len_line
                dict1[id_tem][1] += len_gc
                len_all += len_line
                len_all_gc += len_gc

        print("id", "length","len_all_gc","percentage", file = out_file, sep = "\t")
        print("all", len_all, len_all_gc, gc_fraction(len_all_gc, len_all), file = out_file, sep = "\t")

        # Rows follow dict iteration order (insertion order on Python 3.7+).
        for i,j in dict1.items():
            print(i, j[0], j[1], gc_fraction(j[1], j[0]), file = out_file, sep = "\t")
    
    
    root@PC1:/home/test# python test.py        ## 执行脚本
    root@PC1:/home/test# ls
    result.txt  test.fasta  test.py
    root@PC1:/home/test# cat result.txt        ## 结果文件
    id      length  len_all_gc      percentage
    all     451     241     0.5343680709534369
    >scaffold_1     124     55      0.4435483870967742
    >scaffold_2     120     70      0.5833333333333334
    >scaffold_3     180     102     0.5666666666666667
    >scaffold_4     27      14      0.5185185185185185

    参考:https://mp.weixin.qq.com/s?__biz=MzIxNzc1Mzk3NQ==&mid=2247491476&idx=1&sn=c580bf5e497442599df8ede1d382ff23&chksm=97f5af8ca082269aa436c4fb9c40abd622c4bba68e749e0492bc83c7127959d0871344eb5960&scene=178&cur_album_id=2403674812188688386#rd

  • 相关阅读:
    Codeforces Round #107 (Div. 1) D Mission Impassable
    Codeforces Round #107 (Div. 1) C Smart Cheater
    Codeforces Round #104 (Div. 1) D Lucky Pair
    Codeforces Round #104 (Div. 1) C Lucky Subsequence
    拓扑排序&&欧拉(回)路
    复习笔记之矩阵快速幂(不定时更新)
    复习笔记之母函数
    树链剖分来一发
    最短路算法略解
    题目记录
  • 原文地址:https://www.cnblogs.com/liujiaxin2018/p/16560112.html
Copyright © 2020-2023  润新知