• linux shell实现统计 位点缺失率


    1、脚本

    [root@centos79 test]# cat test.sh
    #!/bin/bash
    
    #step1 check ped file
    uniqn=$(sed 's/\r//g' $1 | cut -d " " -f 7- | sed 's/ /\n/g' | sort -u | wc -l)
    if [ $uniqn -gt 5 ]
    then
    echo "error, exception nucleotide: "
    sed 's/^M//g' $1 | cut -d " " -f 7- | sed 's/ /\n/g' | sort -u | grep -v [ATGC0]
    exit
    fi
    
    
    temp1=$(head -n 1 $1 | awk '{print NF}')
    for i in $(seq `sed -n "$=" $1`)
    do
    temp2=$(sed -n "$i"p $1 | awk '{print NF}')
    if [ $temp2 -ne $temp1 ]
    then
    echo "inconsistent number of columns!"
    echo -e "colun1 1: $temp1\ncolume $i: $temp2"
    exit
    fi
    done
    echo "step 1 done!"
    
    
    #step2 check consistence of ped and map file
    
    mapline=$(sed -n "$=" $2)
    locinum=$(head -n 1 $1 | awk '{print (NF - 6)/2}')
    
    if [ $mapline -ne $locinum ]
    then
    echo "error, map file and ped file do not match!"
    echo -e "mapline: $mapline\nlocinum: $locinum"
    exit
    fi
    echo "step 2 done!"
    
    #step 3 statistics
    lines=$(sed -n "$=" $1)
    for i in $(seq `head -n 1 $1 | awk '{print (NF - 6)/2}'`); do awk -v a=$i '{print $(a * 2 + 5), $(a * 2 + 6)}' $1 | awk -v a=$lines -v RS="@#$j" '{print gsub(/0/,"&")/(a*2)}' >> missresult1; done
    echo "step 3 done!"
    
    #step 4 add loci and headline
    cut -f 1-2,4 $2 | paste - missresult1 | sed '1i chr\tsnp\tpos\tmissing_rate' > missresult.txt
    rm -f  missresult1
    echo "fineshed!"

    2、测试数据及用法

    [root@centos79 test]# ls
    outcome.map  outcome.ped  test  test.sh
    [root@centos79 test]# cat outcome.map
    1       snp1    0       55910
    1       snp2    0       85204
    1       snp3    0       122948
    1       snp4    0       203750
    1       snp5    0       312707
    1       snp6    0       356863
    1       snp7    0       400518
    1       snp8    0       487423
    [root@centos79 test]# cat outcome.ped
    DOR 1 0 0 0 -9 G G 0 0 0 0 0 0 A A T T G G 0 0
    DOR 2 0 0 0 -9 0 0 G G 0 0 0 0 A A T T A G 0 0
    DOR 3 0 0 0 -9 0 0 C C 0 0 G G G G T T A G 0 0
    DOR 4 0 0 0 -9 0 0 C C 0 0 G G G G A A G G G G
    DOR 5 0 0 0 -9 G G C C 0 0 G G G G A A A G G C
    DOR 6 0 0 0 -9 G G C C 0 0 G G G G A A A A C C
    [root@centos79 test]# bash test.sh outcome.ped outcome.map
    step 1 done!
    step 2 done!
    step 3 done!
    fineshed!
    [root@centos79 test]# ls
    missresult.txt  outcome.map  outcome.ped  test  test.sh
    [root@centos79 test]# cat missresult.txt
    chr     snp     pos     missing_rate
    1       snp1    55910   0.5
    1       snp2    85204   0.166667
    1       snp3    122948  1
    1       snp4    203750  0.333333
    1       snp5    312707  0
    1       snp6    356863  0
    1       snp7    400518  0
    1       snp8    487423  0.5

    3、plink软件验证

    [root@centos79 test]# ls
    missresult.txt  outcome.map  outcome.ped  test  test.sh
    [root@centos79 test]# plink --file outcome --missing --out temp > /dev/null; rm *.log *.nosex
    [root@centos79 test]# ls
    missresult.txt  outcome.map  outcome.ped  temp.imiss  temp.lmiss  test  test.sh
    [root@centos79 test]# cat  temp.lmiss
     CHR  SNP   N_MISS   N_GENO   F_MISS
       1 snp1        3        6      0.5
       1 snp2        1        6   0.1667
       1 snp3        6        6        1
       1 snp4        2        6   0.3333
       1 snp5        0        6        0
       1 snp6        0        6        0
       1 snp7        0        6        0
       1 snp8        3        6      0.5
    [root@centos79 test]# cat missresult.txt
    chr     snp     pos     missing_rate
    1       snp1    55910   0.5
    1       snp2    85204   0.166667
    1       snp3    122948  1
    1       snp4    203750  0.333333
    1       snp5    312707  0
    1       snp6    356863  0
    1       snp7    400518  0
    1       snp8    487423  0.5

  • 相关阅读:
    努力的一分不会少
    C语言中的数组的访问方式
    代码的规划与规范化
    Manjaro中源码安装gcc7.1
    周围都是敌人,方法总比问题多
    elementary os 0.4.1下编译GCC-7.1源码并安装成功
    C语言中的基础知识变量探讨
    19年的桌面KDE的风雨和陪伴,没有什么能够割舍
    《手把手教你学C语言》学习笔记(10)--- 程序的循环控制
    copy 的实现原理与深浅拷贝
  • 原文地址:https://www.cnblogs.com/liujiaxin2018/p/15489116.html
Copyright © 2020-2023  润新知