• 怎么检测自己fastq的Phred类型 | phred33 phred64


     http://wiki.bits.vib.be/index.php/Identify_the_Phred_scale_of_quality_scores_used_in_fastQ

    #  S - Sanger        Phred+33,  raw reads typically (0, 40)
    #  X - Solexa        Solexa+64, raw reads typically (-5, 40)
    #  I - Illumina 1.3+ Phred+64,  raw reads typically (0, 40)
    #  J - Illumina 1.5+ Phred+64,  raw reads typically (3, 40) with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold)
    #  L - Illumina 1.8+ Phred+33,  raw reads typically (0, 41)
    

    用别人的工具会比价靠谱,自己写容易出错,或考虑不周:

    BBMap as a little tool for this:

    $ testformat.sh in=N0174.fq.gz
    

    可以这么封装到流程里:

    fq1=SRR8501263_1.fastq.gz
    fq2=SRR8501263_2.fastq.gz
    
    /home/lizhixin/softwares/bbmap/testformat.sh in=$fq1 | cut -f 1 > fastq.format
    
    # phred=""
    for line in $(cat fastq.format); do phred=$line; done
    
    if [ $phred=="sanger" ]
    then
        echo "sanger"
    else
        echo "not sanger"
    fi
    

     

    直接用awk命令:

    这个对SRA转的fastq无法判断

    zcat A14_1.fastq.gz | head -100 | awk '{if(NR%4==0) printf("%s",$0);}' |  od -A n -t u1 | awk 'BEGIN{min=100;max=0;}{for(i=1;i<=NF;i++) {if($i>max) max=$i; if($i<min) min=$i;}}END{if(max<=74 && min<59) print "Phred+33"; else if(max>73 && min>=64) print "Phred+64"; else if(min>=59 && min<64 && max>73) print "Solexa+64"; else print "Unknown score encoding!";}'
    

      

    一个Perl脚本

    #!/usr/bin/perl -w
    
    # http://wiki.bits.vib.be/index.php/Identify_the_Phred_scale_of_quality_scores_used_in_fastQ
    
    use strict;
    use File::Basename;
    use List::MoreUtils qw( minmax );
    
    # fastq_detect.pl fastq.file sample-size
    # detect fastQ format from quality scores in fastQ input file
    # Version 3
    #
    # Stephane Plaisance - VIB-BITS - July-04-2012
    # Joachim Jacob - Aug-02-2012 - joachim.jacob@gmail.com
    # - changed the maximum value of Sanger to 73
    # - changed reading the file with a file handle
    #   (was a file handle !! supporting several archive formats. SP)
    # - changed the diagnosing algoritm
    # Stephane Plaisance - VIB-BITS - April-08-2013
    # - merged both versions and corrected flaw in min/max
    # thanks to Sergey Mitrfanov for perl reformatting
    
    #####################################################################
    # diagnose
    #   SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.....................................................
    #   ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX......................
    #   ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII......................
    #   .................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ......................
    #   LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL....................................................
    #   !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^_`abcdefghijklmnopqrstuvwxyz{|}~
    #   |                         |    |        |                              |                     |
    #  33                        59   64       73                            104                   126
    # S 0........................26...31.......40
    # X                          -5....0........9.............................40
    # I                                0........9.............................40
    # J                                   3.....9.............................40
    # L 0.2......................26...31........41
    #
    #  S - Sanger        Phred+33,  raw reads typically (0, 40)
    #  X - Solexa        Solexa+64, raw reads typically (-5, 40)
    #  I - Illumina 1.3+ Phred+64,  raw reads typically (0, 40)
    #  J - Illumina 1.5+ Phred+64,  raw reads typically (3, 40) with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold)
    #  L - Illumina 1.8+ Phred+33,  raw reads typically (0, 41)
    #####################################################################
    
    my $script = basename($0);
    
    @ARGV gt 0 or die "usage: $script <fastq file> <opt:sample-size (100)>
    ";
    my ($inputfile, $limit) = @ARGV;
    if (! defined $limit) { $limit = 100}; # check first 100 records
    
    my $cnt=0;
    my ($min, $max); # global min and max values
    
    # print STDERR "
    ## Analysing ".$limit." records from $inputfile ... 
    ";
    my $z = ReadFile ($inputfile) || die "Error: cannot read from variant file $inputfile: $!
    ";
    
    ## parse
    while (my $id = <$z>) {
    	$id =~ m/^@/ || die "expected @ not found in line 1!
    ";
    	my $seq = <$z>;
    	my $sep = <$z>;
    	$sep =~ m/^+/ || die "expected + not found in line 3!
    ";
    	my $qual = <$z>;
    	chomp($qual);
    	$cnt++;
    	$cnt>=$limit && last;
    
    	# char to ascii
    	my @chars = split("", $qual);
    	my @nums = sort { $a <=> $b } (map { unpack("C*", $_ )} @chars);
    
    	if ($cnt==1) {
    		($min, $max) = minmax @nums;
    	} else {
    		my ($lmin, $lmax) = minmax @nums; # local values for this read
    		$lmin<$min ? $min=$lmin : $min=$min;
    		$lmax>$max ? $max=$lmax : $max=$max;
    	}
    }
    
    undef $z;
    
    ## diagnose
    my %diag=(
    			'Sanger'		=> '.',
    			'Solexa'		=> '.',
    			'Illumina 1.3+'	=> '.',
    			'Illumina 1.5+'	=> '.',
    			'Illumina 1.8+'	=> '.',
    			);
    
    my %comment=(
    			'Sanger'		=> 'Phred+33,  Q[33; 73],  (0, 40)',
    			'Solexa'		=> 'Solexa+64, Q[59; 104], (-5, 40)',
    			'Illumina 1.3+'	=> 'Phred+64,  Q[64; 104], (0, 40)',
    			'Illumina 1.5+'	=> 'Phred+64,  Q[66; 104], (3, 40), with 0=N/A, 1=N/A, 2=Read Segment Quality Control Indicator',
    			'Illumina 1.8+'	=> 'Phred+33,  Q[33; 74],  (0, 41)',
    			);
    
    if ($min<33 || $max>104) { die "Quality values corrupt. found [$min; $max] where [33; 104] was expected
    "; }
    if ($min>=33 && $max<=73)  {$diag{'Sanger'}='x';}
    if ($min>=59 && $max<=104) {$diag{'Solexa'}='x';}
    if ($min>=64 && $max<=104) {$diag{'Illumina 1.3+'}='x';}
    if ($min>=66 && $max<=104) {$diag{'Illumina 1.5+'}='x';}
    if ($min>=33 && $max<=74)  {$diag{'Illumina 1.8+'}='x';}
    
    ## report
    # print STDERR "# sampled raw quality values are in the range of [".$min."; ".$max."]
    ";
    # print STDERR "# format(s) marked below with 'x' agree with this range
    ";
    
    foreach my $format (sort keys %diag) {
    	#print STDERR sprintf("  %-13s : %2s  [%-30s] 
    ", $format, $diag{$format}, $comment{$format});
    	if ($diag{$format} eq "x") {print "$format
    "}
    }
    
    
    ##############
    #### Subs ####
    
    # reads from uncompressed, gzipped and bgzip fastQ files
    sub ReadFile {
    	my $infile = shift;
    	my $FH;
    	if ($infile =~ /.bz2$/) {
    		open ($FH, "bzcat $infile |") or die ("$!: can't open file $infile");
    	} elsif ($infile =~ /.gz$/) {
    		open ($FH, "zcat $infile |") or die ("$!: can't open file $infile");
    	} elsif ($infile =~ /.fq|.fastq|.txt$/) {
    		open ($FH, "cat $infile |") or die ("$!: can't open file $infile");
    	} else {
    		die ("$!: do not recognise file type $infile");
    	}
    	return $FH;
    }
    

      

  • 相关阅读:
    Centos 下oracle 11g 安装部署及手动建库过程
    MongoDB 存储引擎Wiredtiger原理剖析
    有关RDS上只读实例延时分析-同适用于自建MySQL主从延时分析判断
    windows 下my.ini的配置优化
    什么是purge操作
    linux内核调优参考
    通过第三方镜像仓库代理下载镜像
    微积分拾遗——链式法则
    Java中的RASP实现
    机器学习是什么
  • 原文地址:https://www.cnblogs.com/leezx/p/8657028.html
Copyright © 2020-2023  润新知