• python中统计人类基因组的外显子总长度(部分测试序列)


    001、方法1

    root@PC1:/home/test# ls
    a.txt  test.py
    root@PC1:/home/test# cat a.txt                                            ## 测试数据
    #chromosome     nc_accession    gene    gene_id ccds_id ccds_status     cds_strand      cds_from        cds_to  cds_locations   match_type
    1       NC_000001.8     LINC00115       79854   CCDS1.1 Withdrawn       -       801942  802433  [801942-802433] Identical
    1       NC_000001.11    SAMD11  148398  CCDS2.2 Public  +       925941  944152  [925941-926012, 930154-930335, 931038-931088, 935771-935895, 939039-939128, 939274-939459, 941143-941305, 942135-942250, 942409-942487, 942558-943057, 943252-943376, 943697-943807, 943907-944152]       Identical
    1       NC_000001.11    NOC2L   26155   CCDS3.1 Public  -       944693  959239  [944693-944799, 945056-945145, 945517-945652, 946172-946285, 946401-946544, 948130-948231, 948489-948602, 951126-951237, 951999-952138, 952411-952599, 953174-953287, 953781-953891, 954003-954081, 955922-956012, 956094-956214, 956893-957024, 957098-957272, 958928-959080, 959214-959239]     Identical
    1       NC_000001.11    PLEKHN1 84069   CCDS4.1 Public  +       966531  974574  [966531-966613, 966703-966802, 970276-970422, 970520-970600, 970685-970757, 970878-971005, 971112-971207, 971323-971403, 972074-972149, 972287-972423, 972860-973009, 973185-973325, 973499-973639, 973832-974050, 974315-974363, 974441-974574]  Identical
    root@PC1:/home/test# cat test.py                                          ## 测试程序
    #!/usr/bin/python
    """Sum exon lengths from the cds_locations column (10th field) of a CCDS table.

    Method 1: intervals are collected in a dict keyed by their start coordinate.
    """


    def total_exon_length(path):
        """Return the summed length of the exon intervals listed in ``path``.

        The file is tab-separated with one header line. The 10th column holds
        intervals formatted as ``[start-end, start-end, ...]``; rows whose 10th
        column is not wrapped in brackets are ignored. Each interval contributes
        ``end - start + 1`` bases (both coordinates are counted).

        Raises:
            OSError: if ``path`` cannot be opened.
            IndexError / ValueError: if a bracketed interval is not ``int-int``.
        """
        # Maps start coordinate (as text) -> end coordinate (as text).
        # NOTE(review): the key is the start coordinate alone, so a later
        # interval with the same start replaces an earlier one, whatever its end
        # or chromosome. This acts as a crude de-duplication; confirm it is the
        # intended rule before running on the full genome.
        end_by_start = {}

        # The context manager closes the file even if parsing fails part-way.
        with open(path, "r") as in_file:
            next(in_file, None)  # skip the header line (no-op on an empty file)
            for line in in_file:
                fields = line.strip().split("\t")
                if len(fields) < 10:
                    # Blank or truncated line (e.g. trailing newline): no column 10.
                    continue
                locations = fields[9]
                if not (locations.startswith("[") and locations.endswith("]")):
                    continue
                for interval in locations.lstrip("[").rstrip("]").split(", "):
                    bounds = interval.split("-")
                    end_by_start[bounds[0]] = bounds[1]

        return sum(int(end) - int(start) + 1 for start, end in end_by_start.items())


    if __name__ == "__main__":
        print(total_exon_length("a.txt"))
    root@PC1:/home/test# python test.py                                   ## 运行程序
    6624

    002、方法2

    root@PC1:/home/test# cat a.txt                                        ## 测试数据
    #chromosome     nc_accession    gene    gene_id ccds_id ccds_status     cds_strand      cds_from        cds_to  cds_locations   match_type
    1       NC_000001.8     LINC00115       79854   CCDS1.1 Withdrawn       -       801942  802433  [801942-802433] Identical
    1       NC_000001.11    SAMD11  148398  CCDS2.2 Public  +       925941  944152  [925941-926012, 930154-930335, 931038-931088, 935771-935895, 939039-939128, 939274-939459, 941143-941305, 942135-942250, 942409-942487, 942558-943057, 943252-943376, 943697-943807, 943907-944152]       Identical
    1       NC_000001.11    NOC2L   26155   CCDS3.1 Public  -       944693  959239  [944693-944799, 945056-945145, 945517-945652, 946172-946285, 946401-946544, 948130-948231, 948489-948602, 951126-951237, 951999-952138, 952411-952599, 953174-953287, 953781-953891, 954003-954081, 955922-956012, 956094-956214, 956893-957024, 957098-957272, 958928-959080, 959214-959239]     Identical
    1       NC_000001.11    PLEKHN1 84069   CCDS4.1 Public  +       966531  974574  [966531-966613, 966703-966802, 970276-970422, 970520-970600, 970685-970757, 970878-971005, 971112-971207, 971323-971403, 972074-972149, 972287-972423, 972860-973009, 973185-973325, 973499-973639, 973832-974050, 974315-974363, 974441-974574]  Identical
    root@PC1:/home/test# cat test.py                                      ## 测试程序
    #!/usr/bin/python
    """Sum exon lengths from the cds_locations column (10th field) of a CCDS table.

    Method 2: interval starts and ends are collected in two parallel lists.
    """


    def total_exon_length(path):
        """Return the summed length of every exon interval listed in ``path``.

        The file is tab-separated with one header line. The 10th column holds
        intervals formatted as ``[start-end, start-end, ...]``; rows whose 10th
        column is not wrapped in brackets are ignored. Each interval contributes
        ``end - start + 1`` bases, and repeated intervals are counted every time
        they appear (nothing is de-duplicated).

        Raises:
            OSError: if ``path`` cannot be opened.
            IndexError / ValueError: if a bracketed interval is not ``int-int``.
        """
        starts = []
        ends = []

        # The context manager closes the file even if parsing fails part-way.
        with open(path, "r") as in_file:
            data_lines = in_file.readlines()[1:]  # drop the header line

        for line in data_lines:
            fields = line.strip().split("\t")
            if len(fields) < 10:
                # Blank or truncated line (e.g. trailing newline): no column 10.
                continue
            locations = fields[9]
            if locations.startswith("[") and locations.endswith("]"):
                for interval in locations.replace("[", "").replace("]", "").split(", "):
                    bounds = interval.split("-")
                    starts.append(bounds[0])
                    ends.append(bounds[1])

        # starts[k] and ends[k] always describe the same interval.
        return sum(int(end) - int(start) + 1 for start, end in zip(starts, ends))


    if __name__ == "__main__":
        print(total_exon_length("a.txt"))
    root@PC1:/home/test# python test.py     ## 执行程序
    6624

    003、shell验证

    root@PC1:/home/test# awk -F "\t" 'NR > 1 {print $10}' a.txt | sed -e 's/\[\|\]//g' -e 's/, /\n/g' | awk -F "-" '{sum += ($2 - $1 + 1)} END {print sum}'    ## skip header, take column 10, strip brackets, one interval per line, sum (end - start + 1)
    6624

    参考:https://www.jianshu.com/p/a7b20c2af042

  • 相关阅读:
    20182320《程序设计与数据结构》第八周学习总结
    20182320《程序设计与数据结构》第七周学习总结
    20182320 2019-2020-1 《数据结构与面向对象程序设计》实验6报告
    实验5
    20182320《程序设计与数据结构》第六周学习总结
    20182320 2019-2020-1 《数据结构与面向对象程序设计》第5周学习总结
    实验4
    20182320 2019-2020-1 《数据结构与面向对象程序设计》第4周学习总结
    实验3
    实验2报告
  • 原文地址:https://www.cnblogs.com/liujiaxin2018/p/16593382.html
Copyright © 2020-2023  润新知