001、方法1(统计 a.txt 第 10 列 cds_locations 中所有区间的长度之和)
#!/usr/bin/python
# Method 1: add up the length of every interval listed in the cds_locations
# column (the 10th tab-separated column) of a.txt.
#
# Terminal session from the article.  The program splits each line on tabs,
# so the real a.txt must be tab-separated; the columns are shown here
# separated by single spaces.
#
#   root@PC1:/home/test# ls
#   a.txt  test.py
#   root@PC1:/home/test# cat a.txt        ## test data
#   #chromosome nc_accession gene gene_id ccds_id ccds_status cds_strand cds_from cds_to cds_locations match_type
#   1 NC_000001.8 LINC00115 79854 CCDS1.1 Withdrawn - 801942 802433 [801942-802433] Identical
#   1 NC_000001.11 SAMD11 148398 CCDS2.2 Public + 925941 944152 [925941-926012, 930154-930335, 931038-931088, 935771-935895, 939039-939128, 939274-939459, 941143-941305, 942135-942250, 942409-942487, 942558-943057, 943252-943376, 943697-943807, 943907-944152] Identical
#   1 NC_000001.11 NOC2L 26155 CCDS3.1 Public - 944693 959239 [944693-944799, 945056-945145, 945517-945652, 946172-946285, 946401-946544, 948130-948231, 948489-948602, 951126-951237, 951999-952138, 952411-952599, 953174-953287, 953781-953891, 954003-954081, 955922-956012, 956094-956214, 956893-957024, 957098-957272, 958928-959080, 959214-959239] Identical
#   1 NC_000001.11 PLEKHN1 84069 CCDS4.1 Public + 966531 974574 [966531-966613, 966703-966802, 970276-970422, 970520-970600, 970685-970757, 970878-971005, 971112-971207, 971323-971403, 972074-972149, 972287-972423, 972860-973009, 973185-973325, 973499-973639, 973832-974050, 974315-974363, 974441-974574] Identical
#   root@PC1:/home/test# cat test.py      ## test program (the code below)
#   root@PC1:/home/test# python test.py   ## run the program
#   6624


def total_cds_length(path="a.txt"):
    """Return the summed length of all intervals in column 10 of `path`.

    The first line of the file is treated as a header and skipped.  A row
    contributes only when its 10th tab-separated field is a bracketed,
    ", "-separated list of "start-end" pairs; each pair adds
    end - start + 1 to the total.

    Raises:
        IOError/OSError: if `path` cannot be opened.
        ValueError: if a pair inside the brackets is not "int-int".
    """
    # A list of (start, end) tuples is used instead of a dict keyed by start:
    # a dict would silently drop an interval whenever two intervals (possibly
    # from different rows) begin at the same coordinate.
    intervals = []
    with open(path, "r") as in_file:
        next(in_file, None)  # skip the header line
        for line in in_file:
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) < 10:
                # Blank or truncated line: there is no 10th column to read.
                continue
            locations = fields[9]
            if not (locations.startswith("[") and locations.endswith("]")):
                continue
            for span in locations[1:-1].split(", "):
                if not span:
                    # "[]" yields a single empty piece; nothing to add.
                    continue
                start, end = span.split("-")
                intervals.append((int(start), int(end)))
    return sum(end - start + 1 for start, end in intervals)


if __name__ == "__main__":
    print(total_cds_length())
002、方法2
#!/usr/bin/python
# Method 2: the same total as method 1, computed by collecting interval
# starts and ends in two parallel lists.
#
# Terminal session from the article.  The program splits each line on tabs,
# so the real a.txt must be tab-separated; the columns are shown here
# separated by single spaces.
#
#   root@PC1:/home/test# cat a.txt        ## test data
#   #chromosome nc_accession gene gene_id ccds_id ccds_status cds_strand cds_from cds_to cds_locations match_type
#   1 NC_000001.8 LINC00115 79854 CCDS1.1 Withdrawn - 801942 802433 [801942-802433] Identical
#   1 NC_000001.11 SAMD11 148398 CCDS2.2 Public + 925941 944152 [925941-926012, 930154-930335, 931038-931088, 935771-935895, 939039-939128, 939274-939459, 941143-941305, 942135-942250, 942409-942487, 942558-943057, 943252-943376, 943697-943807, 943907-944152] Identical
#   1 NC_000001.11 NOC2L 26155 CCDS3.1 Public - 944693 959239 [944693-944799, 945056-945145, 945517-945652, 946172-946285, 946401-946544, 948130-948231, 948489-948602, 951126-951237, 951999-952138, 952411-952599, 953174-953287, 953781-953891, 954003-954081, 955922-956012, 956094-956214, 956893-957024, 957098-957272, 958928-959080, 959214-959239] Identical
#   1 NC_000001.11 PLEKHN1 84069 CCDS4.1 Public + 966531 974574 [966531-966613, 966703-966802, 970276-970422, 970520-970600, 970685-970757, 970878-971005, 971112-971207, 971323-971403, 972074-972149, 972287-972423, 972860-973009, 973185-973325, 973499-973639, 973832-974050, 974315-974363, 974441-974574] Identical
#   root@PC1:/home/test# cat test.py      ## test program (the code below)
#   root@PC1:/home/test# python test.py   ## run the program
#   6624


def total_cds_length(path="a.txt"):
    """Return the summed length of all intervals in column 10 of `path`.

    The first line of the file is treated as a header and skipped.  A row
    contributes only when its 10th tab-separated field starts with "[" and
    ends with "]"; the brackets are removed, the remainder is split on ", "
    into "start-end" pairs, and each pair adds end - start + 1 to the total.

    Raises:
        IOError/OSError: if `path` cannot be opened.
        ValueError: if a start or end is not an integer.
    """
    starts = []
    ends = []
    # `with` closes the file even if reading fails part-way through.
    with open(path, "r") as in_file:
        lines = in_file.readlines()[1:]  # drop the header line
    for line in lines:
        fields = line.rstrip("\r\n").split("\t")
        if len(fields) < 10:
            # Blank or truncated line: there is no 10th column to read.
            continue
        if fields[9].startswith("[") and fields[9].endswith("]"):
            spans = fields[9].replace("[", "").replace("]", "").split(", ")
            for span in spans:
                if not span:
                    # "[]" yields a single empty piece; nothing to add.
                    continue
                pieces = span.split("-")
                starts.append(pieces[0])
                ends.append(pieces[1])
    length = 0
    for start, end in zip(starts, ends):
        length += int(end) - int(start) + 1
    return length


if __name__ == "__main__":
    print(total_cds_length())
003、shell验证
# Shell cross-check of the Python results: sum the lengths of all intervals
# in the 10th tab-separated column (cds_locations) of a file.
#
# Usage (at the article's prompt root@PC1:/home/test#):
#   cds_total_length a.txt      # prints 6624 for the sample a.txt
#
# Replaces the original awk | sed | sed | sed | awk pipeline with one awk
# program, for two reasons:
#   * only bracketed values are counted, matching the Python versions; the
#     pipeline added 1 for every row whose column 10 was "-" and for every
#     blank line, because splitting such a value on "-" gives two empty
#     fields and 0 - 0 + 1 = 1;
#   * no GNU-only sed features ("\|" alternation, "\n" in the replacement).

#######################################
# Print the total length of all "start-end" intervals in column 10.
# Arguments: $1 - path to the tab-separated input file (default: a.txt)
# Outputs:   the total as an integer on stdout (0 if nothing matched)
# Returns:   awk's exit status (non-zero if the file cannot be read)
#######################################
cds_total_length() {
  local file=${1:-a.txt}
  awk -F '\t' '
    NR == 1 { next }                      # skip the header line
    $10 ~ /^\[.*\]$/ {                    # only "[...]" interval lists
      locations = substr($10, 2, length($10) - 2)
      n = split(locations, spans, ", ")
      for (i = 1; i <= n; i++) {
        if (split(spans[i], bounds, "-") == 2) {
          sum += bounds[2] - bounds[1] + 1
        }
      }
    }
    END { printf "%d\n", sum }
  ' "$file"
}
参考:https://www.jianshu.com/p/a7b20c2af042