• 【WDL】7. 实践:GATK calling变异(人类)


    功能

    使用BWA + GATK进行变异检测的最佳实践流程,且优化为按染色体切分,并行进行变异检测和BQSR步骤,加快分析进度。

    流程图

    Graph.png

    input.json

    {
        "wgs.apply_bqsr.cpu": 32,
        "wgs.apply_bqsr.disks": "local-disk 250 cloud_ssd",
        "wgs.apply_bqsr.gatk_Launcher": "/usr/local/bin/gatk-4.1.4.1/gatk",
        "wgs.apply_bqsr.java_opts": "'-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms3000m -Djava.io.tmpdir=/mnt'",
        "wgs.apply_bqsr.memory": "64G",
        "wgs.base_recalibrator.cpu": 16,
        "wgs.base_recalibrator.disks": "local-disk 250 cloud_ssd",
        "wgs.base_recalibrator.gatk_Launcher": "/usr/local/bin/gatk-4.1.4.1/gatk",
        "wgs.base_recalibrator.java_opts": "'-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms3000m -Djava.io.tmpdir=/mnt'",
        "wgs.base_recalibrator.memory": "32G",
        "wgs.bwa_align.bwa_opts": "-K 10000000 -M -Y",
        "wgs.bwa_align.bwa_release": "/usr/local/bin/bwakit-0.7.15/",
        "wgs.bwa_align.cpu": 32,
        "wgs.bwa_align.disks": "local-disk 250 cloud_ssd",
        "wgs.bwa_align.memory": "64G",
        "wgs.bwa_align.samtools_release": "/usr/local/bin/samtools-1.9/",
        "wgs.dedup.cpu": 16,
        "wgs.dedup.create_index": true,
        "wgs.dedup.disks": "local-disk 250 cloud_ssd",
        "wgs.dedup.gatk_Launcher": "/usr/local/bin/gatk-4.1.4.1/gatk",
        "wgs.dedup.java_opts": "'-Xms4000m -Djava.io.tmpdir=/mnt -Dsamjdk.compression_level=2'",
        "wgs.dedup.memory": "32G",
        "wgs.dedup.remove_duplicates": true,
        "wgs.docker_img": "call_variants:1.0",
        "wgs.fastq1": "/path/to/reads_1.fastq",
        "wgs.fastq2": "/path/to/reads_2.fastq",
        "wgs.gather_bam_files.cpu": 16,
        "wgs.gather_bam_files.create_index": true,
        "wgs.gather_bam_files.disks": "local-disk 250 cloud_ssd",
        "wgs.gather_bam_files.gatk_Launcher": "/usr/local/bin/gatk-4.1.4.1/gatk",
        "wgs.gather_bam_files.java_opts": "'-Xms3000m -Djava.io.tmpdir=/mnt -Dsamjdk.compression_level=2'",
        "wgs.gather_bam_files.memory": "32G",
        "wgs.gather_bqsr_report.cpu": 16,
        "wgs.gather_bqsr_report.disks": "local-disk 250 cloud_ssd",
        "wgs.gather_bqsr_report.gatk_Launcher": "/usr/local/bin/gatk-4.1.4.1/gatk",
        "wgs.gather_bqsr_report.java_opts": "'-Xms3000m -Djava.io.tmpdir=/mnt'",
        "wgs.gather_bqsr_report.memory": "32G",
        "wgs.genome_dict": "/path/to/Homo_sapiens_assembly38.dict",
        "wgs.genome_indexes": [
            "/path/to/Homo_sapiens_assembly38.fasta",
            "/path/to/Homo_sapiens_assembly38.fasta.64.alt",
        "/path/to/Homo_sapiens_assembly38.fasta.64.amb",
            "/path/to/Homo_sapiens_assembly38.fasta.64.ann",
            "/path/to/Homo_sapiens_assembly38.fasta.64.bwt",
            "/path/to/Homo_sapiens_assembly38.fasta.64.pac",
            "/path/to/Homo_sapiens_assembly38.fasta.64.sa",
            "/path/to/Homo_sapiens_assembly38.dict",
            "/path/to/Homo_sapiens_assembly38.fasta.fai"
        ],
        "wgs.genotype_gvcfs.cpu": 16,
        "wgs.genotype_gvcfs.disks": "local-disk 250 cloud_ssd",
        "wgs.genotype_gvcfs.gatk_Launcher": "/usr/local/bin/gatk-4.1.4.1/gatk",
        "wgs.genotype_gvcfs.java_opts": "'-Xms4000m -Djava.io.tmpdir=/mnt'",
        "wgs.genotype_gvcfs.memory": "32G",
        "wgs.haplotype_caller.cpu": 32,
        "wgs.haplotype_caller.disks": "local-disk 250 cloud_ssd",
        "wgs.haplotype_caller.gatk_Launcher": "/usr/local/bin/gatk-4.1.4.1/gatk",
        "wgs.haplotype_caller.java_opts": "'-Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Djava.io.tmpdir=/mnt'",
        "wgs.haplotype_caller.memory": "64G",
        "wgs.known_1000GIndels": [
            "/path/to/Homo_sapiens_assembly38.known_indels.vcf.gz",
            "/path/to/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi"
        ],
        "wgs.known_Mills": [
            "/path/to/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz",
            "/path/to/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi"
        ],
        "wgs.known_dbsnp": [
            "/path/to/Homo_sapiens_assembly38.dbsnp138.vcf",
            "/path/to/Homo_sapiens_assembly38.dbsnp138.vcf.idx"
        ],
        "wgs.merge_vcfs.cpu": 16,
        "wgs.merge_vcfs.disks": "local-disk 250 cloud_ssd",
        "wgs.merge_vcfs.gatk_Launcher": "/usr/local/bin/gatk-4.1.4.1/gatk",
        "wgs.merge_vcfs.java_opts": "'-Xms2000m -Djava.io.tmpdir=/mnt'",
        "wgs.merge_vcfs.memory": "32G",
        "wgs.reads_group": "'@RG\\tID:GROUP_NAME\\tSM:SAMPLE_NAME\\tPL:PLATFORM'",
        "wgs.sample_id": "test",
        "wgs.wgs_calling_regions": "/path/to/wgs_calling_regions.hg38.interval_list"
    }
    

    interval_list用于并行,需要与基因组版本配套。

    Intervals and interval lists

    WDL

    version 1.0
    
    # End-to-end germline short-variant pipeline (BWA + GATK best practices):
    # align -> mark duplicates -> BQSR (scattered by sequence grouping)
    # -> HaplotypeCaller (scattered by calling intervals) -> merge -> genotype.
    workflow wgs {
    
        input {
            String sample_id
            String reads_group
            File fastq1
            File fastq2
            Array[File] genome_indexes
            File genome_dict
            File wgs_calling_regions
            Array[File] known_dbsnp
            Array[File] known_Mills
            Array[File] known_1000GIndels
            String docker_img = "call_variants:1.0"
        }
    
        # Align paired reads and coordinate-sort the result.
        call bwa_align {
            input: 
                sample_id=sample_id,
                reads_group=reads_group,
                fastq1=fastq1,
                fastq2=fastq2,
                genome_indexes=genome_indexes,
                docker_img=docker_img
        }
    
        # Mark/remove PCR and optical duplicates.
        call dedup {
            input:
                sample_id=sample_id,
                sorted_bam = bwa_align.sorted_bam_output,
                docker_img = docker_img
        }
    
        # Derive contig groupings from the .dict so BQSR can scatter by contig.
        call gen_bqsr_scatter {
            input:
                genome_dict=genome_dict
        }
    
        # BaseRecalibrator per contig group, in parallel.
        scatter (sequence_group_intervals in gen_bqsr_scatter.sequence_grouping) {
            call base_recalibrator {
                input:
                    grouping=sequence_group_intervals,
                    sample_id=sample_id,
                    dedup_bam=dedup.dedup_bam_output,
                    genome_indexes=genome_indexes,
                    known_dbsnp=known_dbsnp,
                    known_1000GIndels=known_1000GIndels,
                    known_Mills=known_Mills,
                    docker_img=docker_img
            }
        }
    
        # Merge the per-group recalibration tables into one report.
        call gather_bqsr_report {
            input:
                recalibration_reports=base_recalibrator.recalibration_report_output,
                sample_id=sample_id,
                docker_img=docker_img
        }
    
        # ApplyBQSR per contig group (including the "-L unmapped" group), in parallel.
        scatter (subgroup in gen_bqsr_scatter.sequence_grouping_with_unmapped) {
            call apply_bqsr {
                input:
                    sample_id=sample_id,
                    calling_region=subgroup,
                    dedup_bam=dedup.dedup_bam_output,
                    genome_indexes=genome_indexes,
                    bqsr_grp=gather_bqsr_report.bqsr_grp_output,
                    docker_img=docker_img
            }
        }
    
        # Concatenate the recalibrated shards back into one BAM.
        call gather_bam_files {
            input:
                sample_id=sample_id,
                separate_recalibrated_bams = apply_bqsr.separate_recalibrated_bam,
                docker_img=docker_img
        }
    
        # Split the calling interval_list into similarly sized shards.
        call gen_hc_scatter {
            input:
                wgs_calling_regions=wgs_calling_regions
        }
    
        # HaplotypeCaller in GVCF mode per interval shard, in parallel.
        scatter (intervals in gen_hc_scatter.scatter_interval_list) {
            call haplotype_caller {
                input:
                    sample_id=sample_id,
                    input_bam=gather_bam_files.recalibrated_bam_output,
                    calling_region=intervals,
                    genome_indexes=genome_indexes,
                    known_dbsnp=known_dbsnp,
                    docker_img=docker_img
            }
        }
    
        # Merge the per-shard GVCF blocks into a single sample GVCF.
        call merge_vcfs {
            input:
                sample_id=sample_id,
                docker_img=docker_img,
                vcfs=haplotype_caller.hc_block_gvcf
        }
    
        # Genotype the merged GVCF to produce the final VCF.
        call genotype_gvcfs {
            input: 
                sample_id=sample_id,
                genome_indexes=genome_indexes,
                gvcf=merge_vcfs.gvcf_output,
                known_dbsnp=known_dbsnp,
                docker_img=docker_img
        }
    
        output {
            # Recalibrated (BAM, BAI) pair.
            Pair[File,File] output_bam = gather_bam_files.recalibrated_bam_output
            # Merged (GVCF, index) pair.
            Pair[File,File] output_gvcf = merge_vcfs.gvcf_output
            # Final genotyped VCF.
            File output_vcf = genotype_gvcfs.vcf_output
        }
    
    }
    
    
    # Align paired-end reads with bwa mem and coordinate-sort with samtools.
    task bwa_align {
    
        input {
            # paired reads
            File fastq1
            File fastq2
    
            # pre-built genome index files; element 0 is the FASTA itself
            Array[File] genome_indexes
    
            # experiment metadata
            String reads_group
            String sample_id
    
            # tool locations and bwa options
            String bwa_release = "/usr/local/bin/bwakit-0.7.15/"
            String samtools_release = "/usr/local/bin/samtools-1.9/"
            String bwa_opts = "-K 10000000 -M -Y"
    
            # Resource
            Int cpu = 32
            String memory = "64G"
            String disks = "local-disk 250 cloud_ssd"
    
            # docker image
            String docker_img
        }
    
        # Output names derived from the sample id.
        String aligned_bam = sample_id + ".sorted.bam"
        String aligned_bam_index = aligned_bam + ".bai"
    
        command <<<
            ncpu=$(nproc)
    
            # Align and coordinate-sort in one pipeline (no intermediate SAM).
            ~{bwa_release}/bwa mem \
            ~{bwa_opts} \
            -t $ncpu \
            -R ~{reads_group} \
            ~{genome_indexes[0]} \
            ~{fastq1} ~{fastq2} \
            | ~{samtools_release}/samtools sort -@ $ncpu -o ~{aligned_bam}
    
            ~{samtools_release}/samtools index ~{aligned_bam}
        >>>
    
        runtime {
            cpu: cpu
            memory: memory
            disks: disks
            docker: docker_img
        }
    
        output {
            # (BAM, BAI) pair consumed by dedup downstream.
            Pair[File, File] sorted_bam_output = (aligned_bam, aligned_bam_index)
        }
    }
    
    # Mark (and optionally drop) duplicate reads with Picard MarkDuplicates.
    task dedup {
    
        input {
            String sample_id
            # (BAM, BAI) pair from bwa_align
            Pair[File, File] sorted_bam
            String gatk_Launcher = "/usr/local/bin/gatk-4.1.4.1/gatk"
            String java_opts = "'-Xms4000m -Djava.io.tmpdir=/mnt -Dsamjdk.compression_level=2'"
            Boolean remove_duplicates = true
            Boolean create_index = true
    
            # Resource
            Int cpu = 16
            String memory = "32G"
            String disks = "local-disk 250 cloud_ssd"
    
            # docker image
            String docker_img
        }
    
        # Derived output names (MarkDuplicates writes the index as *.bai).
        String out_bam = sample_id + ".deduplicated.bam"
        String out_bam_index = sample_id + ".deduplicated.bai"
        String out_metrics = sample_id + ".deduplicated.metrics"
    
        command <<<
            ~{gatk_Launcher} --java-options ~{java_opts} \
            MarkDuplicates \
            -I ~{sorted_bam.left} \
            -O ~{out_bam} \
            -M ~{out_metrics} \
            ~{true="--REMOVE_DUPLICATES true" false="" remove_duplicates} \
            ~{true="--CREATE_INDEX true" false="" create_index}
        >>>
    
        runtime {
            cpu: cpu
            memory: memory
            disks: disks
            docker: docker_img
        }
    
        output {
            Pair[File,File] dedup_bam_output = (out_bam, out_bam_index)
            File dedup_metrics_output = out_metrics
        }
    }
    
    # Split the whole genome for BQSR scatter-gather
    task gen_bqsr_scatter {
    
        input {
            # Reference sequence dictionary (.dict) listing contig names/lengths (@SQ lines).
            File genome_dict
        }
        
        ## NOTE: embedding non-shell scripts inside `command` is discouraged;
        ## kept as-is here because the grouping logic must parse the .dict in-task.
        command <<<
            python - ~{genome_dict} << EOF 
            import sys
            with open(sys.argv[1], "r") as ref_dict_file:
                sequence_tuple_list = []
                longest_sequence = 0
                for line in ref_dict_file:
                    if line.startswith("@SQ"):
                        line_split = line.split("\t")
                            # (Sequence_Name, Sequence_Length)
                        sequence_tuple_list.append((line_split[1].split("SN:")[1], int(line_split[2].split("LN:")[1])))
                longest_sequence = sorted(sequence_tuple_list, key=lambda x: x[1], reverse=True)[0][1]
            # We are adding this to the intervals because hg38 has contigs named with embedded colons and a bug in GATK strips off
            # the last element after a :, so we add this as a sacrificial element.
            hg38_protection_tag = ":1+"
            # initialize the tsv string with the first sequence
            tsv_string = "-L " + sequence_tuple_list[0][0] + hg38_protection_tag
            temp_size = sequence_tuple_list[0][1]
            for sequence_tuple in sequence_tuple_list[1:]:
                if temp_size + sequence_tuple[1] <= longest_sequence:
                    temp_size += sequence_tuple[1]
                    tsv_string += " " + "-L " + sequence_tuple[0] + hg38_protection_tag
                else:
                    tsv_string += "\n" + "-L " + sequence_tuple[0] + hg38_protection_tag
                    temp_size = sequence_tuple[1]
    
            # add the unmapped sequences as a separate line to ensure that they are recalibrated as well
            with open("sequence_grouping.txt", "w") as tsv_file:
                tsv_file.write(tsv_string)
                tsv_file.write("\n")
                tsv_file.close()
            tsv_string += '\n' + "-L unmapped\n"
    
            with open("sequence_grouping_with_unmapped.txt","w") as tsv_file_with_unmapped:
                tsv_file_with_unmapped.write(tsv_string)
                tsv_file_with_unmapped.close()
            EOF
        >>>
    
    
        output {
            # One line per BQSR scatter group: space-joined "-L <contig>:1+" options.
            Array[String] sequence_grouping = read_lines("sequence_grouping.txt")
            # Same groups plus a trailing "-L unmapped" line for ApplyBQSR.
            Array[String] sequence_grouping_with_unmapped = read_lines("sequence_grouping_with_unmapped.txt")
        }
    }
    
    
    
    # Build a BQSR recalibration table for one contig group.
    task base_recalibrator {
        input {
    
            # "-L <contig>:1+ ..." options for this scatter shard
            String grouping
            # (BAM, BAI) pair from dedup
            Pair[File,File] dedup_bam
            Array[File] genome_indexes
            Array[File] known_1000GIndels
            Array[File] known_Mills
            Array[File] known_dbsnp
            String sample_id
            String gatk_Launcher = "/usr/local/bin/gatk-4.1.4.1/gatk"
            String java_opts = "'-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms3000m -Djava.io.tmpdir=/mnt'"
    
            # Resource
            Int cpu = 16
            String memory = "32G"
            String disks = "local-disk 250 cloud_ssd"
    
            # docker image
            String docker_img
        }
    
        # Per-shard recalibration table, merged later by GatherBQSRReports.
        String report_table = sample_id + "_bqsr.table"
    
        command <<<
            ~{gatk_Launcher} --java-options ~{java_opts} \
            BaseRecalibrator \
            -I ~{dedup_bam.left} \
            -R ~{genome_indexes[0]} \
            --known-sites ~{known_1000GIndels[0]} \
            --known-sites ~{known_Mills[0]} \
            --known-sites ~{known_dbsnp[0]} \
            ~{grouping} \
            -O ~{report_table}
        >>>
    
        runtime {
            cpu: cpu
            memory: memory
            disks: disks
            docker: docker_img
        }
    
        output {
            File recalibration_report_output = report_table
        }
    
    }
    
    # Merge the per-interval BQSR tables into a single sample-level report.
    task gather_bqsr_report {
    
        input {
            Array[File] recalibration_reports
            String sample_id
            String gatk_Launcher = "/usr/local/bin/gatk-4.1.4.1/gatk"
            String java_opts = "'-Xms3000m -Djava.io.tmpdir=/mnt'"
    
            # Resource
            Int cpu = 16
            String memory = "32G"
            String disks = "local-disk 250 cloud_ssd"
    
            # docker image
            String docker_img
        }
    
        # Single merged report consumed by ApplyBQSR.
        String merged_report = sample_id + "-bqsr.grp"
    
        command <<<
            ~{gatk_Launcher} --java-options ~{java_opts} \
            GatherBQSRReports \
            -I ~{sep=" -I " recalibration_reports} \
            -O ~{merged_report}
        >>>
    
        runtime {
            cpu: cpu
            memory: memory
            disks: disks
            docker: docker_img
        }
    
        output {
            File bqsr_grp_output = merged_report
        }
    }
    
    # Apply the merged BQSR report to one contig group of the deduplicated BAM.
    task apply_bqsr {
    
        input {
            # (BAM, BAI) pair from dedup
            Pair[File,File] dedup_bam
            Array[File] genome_indexes
            # "-L ..." options for this scatter shard (may be "-L unmapped")
            String calling_region
            # merged recalibration report from gather_bqsr_report
            File bqsr_grp
            String sample_id
    
            String gatk_Launcher = "/usr/local/bin/gatk-4.1.4.1/gatk"
            String java_opts = "'-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms3000m -Djava.io.tmpdir=/mnt'"
    
            # Resource
            Int cpu = 32
            String memory = "64G"
            String disks = "local-disk 250 cloud_ssd"
    
            # docker image
            String docker_img
        }
    
        # Per-shard recalibrated BAM, concatenated later by gather_bam_files.
        String shard_bam = sample_id + "_tmp_recaled.bam"
    
        command <<<
            ~{gatk_Launcher} --java-options ~{java_opts} \
            ApplyBQSR \
            -R ~{genome_indexes[0]} \
            -I ~{dedup_bam.left} \
            --bqsr-recal-file ~{bqsr_grp} \
            ~{calling_region} \
            -O ~{shard_bam}
        >>>
    
        runtime {
            cpu: cpu
            memory: memory
            disks: disks
            docker: docker_img
        }
    
        output {
            File separate_recalibrated_bam = shard_bam
        }
    }
    
    # Concatenate the per-interval recalibrated BAM shards into one BAM.
    task gather_bam_files {
    
        input {
            # Shards from the apply_bqsr scatter, in genomic order.
            Array[File] separate_recalibrated_bams
            String sample_id
            String gatk_Launcher = "/usr/local/bin/gatk-4.1.4.1/gatk"
            String java_opts = "'-Xms3000m -Djava.io.tmpdir=/mnt -Dsamjdk.compression_level=2'"
            Boolean create_index = true
    
            # Resource
            Int cpu = 16
            String memory = "32G"
            String disks = "local-disk 250 cloud_ssd"
            
            # docker image
            String docker_img
        }
    
        # GatherBamFiles names the index *.bai (without the .bam infix).
        String recalibrated_bam = sample_id + ".recalibrated.bam"
        String recalibrated_bam_index = sample_id + ".recalibrated.bai"
    
        command <<<
            ~{gatk_Launcher} --java-options ~{java_opts} \
            GatherBamFiles \
            -I ~{sep=" -I " separate_recalibrated_bams} \
            ~{true="--CREATE_INDEX true " false="" create_index} \
            -O ~{recalibrated_bam}
        >>>
        # BUGFIX: the placeholder option was written `false = ""` — WDL placeholder
        # options take the form option="value" with no whitespace around '=';
        # strict parsers reject the spaced form, and every other task in this
        # file (e.g. dedup) already uses `false=""`.
    
        runtime {
            cpu: cpu
            memory: memory
            disks: disks
            docker: docker_img
        }
    
        output {
            # (BAM, BAI) pair consumed by haplotype_caller.
            Pair[File,File] recalibrated_bam_output = (recalibrated_bam, recalibrated_bam_index)
        }
    }
    
    
    # Break the calling interval_list into sub-intervals
    # Perform variant calling on the sub-intervals, and then gather the results
    
    task gen_hc_scatter {
    
        input {
            # Picard-style interval_list; header lines start with '@',
            # data lines are: contig  start  end  ...
            File wgs_calling_regions
        }
    
        # Greedily packs intervals into shards of roughly 324,860,607 bp each,
        # emitting one line of space-joined "-L chr:start-end" options per shard.
        # An interval straddling the limit is assigned to whichever shard keeps
        # the size closer to the limit (compare overflow a1 vs remaining room a2).
        command <<<
    
            awk 'BEGIN {
            prev_total = 0
            frag = 1
            container = ""
            } 
            { if ( $1 !~ /^@/ ) 
            {
                len = ($3 - $2 + 1)
                if ( prev_total + len < 324860607 ) {
                prev_total += len
                container = container sprintf("-L %s:%d-%d ", $1, $2, $3)
                }
                else {
                a1 = prev_total + len - 324860607
                a2 = 324860607 - prev_total
                if ( a1 > a2 ) { print container; container = sprintf("-L %s:%d-%d ", $1, $2, $3); prev_total = len}
                else { container = container sprintf("-L %s:%d-%d ", $1, $2, $3); print container; container = ""; prev_total = 0}
                frag += 1
                }
            }
            }
            END {
            if ( container ) { print container }
            }' ~{wgs_calling_regions} > "ScatterIntervalList.txt"
    
        >>>
    
        output {
            # One HaplotypeCaller scatter shard per line.
            Array[String] scatter_interval_list = read_lines("ScatterIntervalList.txt")
        }
    }
    
    
    # Call variants in parallel over WGS calling intervals
    
    # Call variants in GVCF mode over one interval shard.
    task haplotype_caller {
    
        input {
            String sample_id
            # (BAM, BAI) pair of the recalibrated alignment
            Pair[File,File] input_bam
            # "-L chr:start-end ..." options for this scatter shard
            String calling_region
            Array[File] genome_indexes
            Array[File] known_dbsnp
            String gatk_Launcher = "/usr/local/bin/gatk-4.1.4.1/gatk"
            String java_opts = "'-Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Djava.io.tmpdir=/mnt'"
    
            # Resource
            Int cpu = 32
            String memory = "64G"
            String disks = "local-disk 250 cloud_ssd"
    
            # docker image
            String docker_img
        }
    
        # Per-shard GVCF block, merged later by merge_vcfs.
        String shard_gvcf = sample_id + "_hc.g.vcf.gz"
    
        command <<<
            ~{gatk_Launcher} --java-options ~{java_opts} \
            HaplotypeCaller \
            -R ~{genome_indexes[0]} \
            -I ~{input_bam.left} \
            ~{calling_region} \
            --dbsnp ~{known_dbsnp[0]} \
            --ERC GVCF \
            -O ~{shard_gvcf}
        >>>
    
        runtime {
            cpu: cpu
            memory: memory
            disks: disks
            docker: docker_img
        }
    
        output {
            File hc_block_gvcf = shard_gvcf
        }
    }
    
    
    # Merge the per-shard GVCF blocks into one sample-level GVCF.
    task merge_vcfs {
    
        input {
            String sample_id
            Array[File] vcfs
            String gatk_Launcher = "/usr/local/bin/gatk-4.1.4.1/gatk"
            String java_opts = "'-Xms2000m -Djava.io.tmpdir=/mnt'"
    
            # Resource
            Int cpu = 16
            String memory = "32G"
            String disks = "local-disk 250 cloud_ssd"
    
            # docker image
            String docker_img
        }
    
        # Merged GVCF and its index (presumably written alongside by
        # MergeVcfs as <name>.tbi — confirm against the tool version in use).
        String merged_gvcf = sample_id + ".g.vcf.gz"
        String merged_gvcf_idx = sample_id + ".g.vcf.gz.tbi"
    
        command <<<
            ~{gatk_Launcher} --java-options ~{java_opts} \
            MergeVcfs \
            -I ~{sep=" -I " vcfs} \
            -O ~{merged_gvcf}
        >>>
    
        runtime {
            cpu: cpu
            memory: memory
            disks: disks
            docker: docker_img
        }
    
        output {
            Pair[File, File] gvcf_output = (merged_gvcf, merged_gvcf_idx)
        }
    }
    
    
    # Genotype the merged GVCF to produce the final VCF.
    task genotype_gvcfs {
    
        input {
            String sample_id
            # (GVCF, index) pair from merge_vcfs
            Pair[File, File] gvcf
            Array[File] genome_indexes
            Array[File] known_dbsnp
            String gatk_Launcher = "/usr/local/bin/gatk-4.1.4.1/gatk"
            String java_opts = "'-Xms4000m -Djava.io.tmpdir=/mnt'"
    
            # Resource
            Int cpu = 16
            String memory = "32G"
            String disks = "local-disk 250 cloud_ssd"
    
            # docker image
            String docker_img
        }
    
        # Final genotyped VCF name.
        String final_vcf = sample_id + ".vcf.gz"
    
        command <<<
            ~{gatk_Launcher} --java-options ~{java_opts} \
            GenotypeGVCFs \
            -R ~{genome_indexes[0]} \
            --dbsnp ~{known_dbsnp[0]} \
            -V ~{gvcf.left} \
            -O ~{final_vcf}
        >>>
    
        runtime {
            cpu: cpu
            memory: memory
            disks: disks
            docker: docker_img
        }
    
        output {
            File vcf_output = final_vcf
        }
    
    }
    

    Referenced from aliyun

  • 相关阅读:
    数据文件对应的磁盘坏掉了,没有归档,没有备份
    Oracle OEM重建
    Verilog编码指南
    UART串口协议
    信号完整性以及串扰
    Perl-由报表转命令(展讯2015)
    论文-ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design
    时序路径分析模式
    后端设计各种设计文件格式说明
    Verilog-小数分频器(1.5)实现(待写)
  • 原文地址:https://www.cnblogs.com/jessepeng/p/16270691.html
Copyright © 2020-2023  润新知