令d[i]为第i个样本数据,cnt为样本个数,经过化简可得
[ans=frac{sum(d[i]^2)}{cnt}-(frac{sum d[i]}{cnt})^2]
枚举每一种可能的三核苷酸,得到它出现的各个位置,假设当前出现了tot个,第i个的编号为a[i],经过化简可得
[cnt+=C_{tot}^2]
[sum d[i]+=sum (a[i+1]-a[i])i(tot-i)]
[sum (d[i]^2)+=totsum(a[i]^2)-(sum a[i])^2]
时间复杂度$O(n)$。
#include<cstdio> #include<cstring> #define N 100010 typedef long long ll; int T,n,i,j,tot[64],q[64][N];ll cnt,sumd,sumd2,s,s2;char a[N]; inline ll C2(ll x){return x*(x-1)/2;} inline ll sqr(ll x){return x*x;} double sqr(double x){return x*x;} double solve(){ scanf("%s",a+1);n=strlen(a+1); for(i=1;i<=n;i++){ if(a[i]=='A')a[i]=0; else if(a[i]=='G')a[i]=1; else if(a[i]=='C')a[i]=2; else a[i]=3; } for(cnt=sumd=sumd2=i=0;i<64;i++)tot[i]=0; for(i=1;i<n-1;i++)j=a[i]|(a[i+1]<<2)|(a[i+2]<<4),q[j][++tot[j]]=i; for(i=0;i<64;i++)if(tot[i]>=2){ cnt+=C2(tot[i]); for(j=1;j<tot[i];j++)sumd+=1LL*(q[i][j+1]-q[i][j])*j*(tot[i]-j); for(s=s2=0,j=1;j<=tot[i];j++)s+=q[i][j],s2+=sqr(1LL*q[i][j]); sumd2+=s2*tot[i]-sqr(s); } if(!cnt)return 0; return 1.0*sumd2/cnt-sqr(1.0*sumd/cnt); } int main(){ for(scanf("%d",&T);T--;printf("%.6f ",solve())); return 0; }