轻卡常NTT

普通版大概需要 2s 左右。
目前在洛谷上能稳定跑到 1s，想要更快可能还要再卡常。

#include <queue>
#include <vector>
#include <iostream>
#include <cstdio>
#include <cstring>
#include <algorithm>
#include <cmath>
#define MP make_pair
#define ll long long
#define fi first
#define se second
using namespace std;

// Reads one decimal integer from stdin into x.
// Every character before the first digit is skipped; if any skipped character
// is '-', the result is negated. One character after the last digit is consumed.
// If end of input is reached before any digit, x is set to 0 and the function
// returns instead of spinning forever.
template <typename T>
void read(T &x) {
    x = 0;
    bool neg = false;
    // Keep the character as int so EOF stays distinguishable from real input.
    int c = getchar();
    while (c != EOF && (c < '0' || c > '9')) {
        if (c == '-') neg = true;
        c = getchar();
    }
    while (c >= '0' && c <= '9') {
        x = x * 10 + (c - '0');
        c = getchar();
    }
    if (neg) x = -x;
}

// Prints the integer x in decimal to stdout, followed by the character ed
// (a newline by default).
// Digits are taken from the signed remainder rather than from -x, so the most
// negative value of F is printed correctly without overflowing.
template<typename F>
inline void write(F x, char ed = '\n')
{
	char digits[40];
	int cnt = 0;
	if (x < 0) putchar('-');
	// Digits come out least-significant first; buffer them and emit in reverse.
	do {
		int d = (int)(x % 10);
		digits[cnt++] = (char)('0' + (d < 0 ? -d : d));
		x /= 10;
	} while (x);
	while (cnt) putchar(digits[--cnt]);
	putchar(ed);
}

// Raises x to y when y is strictly greater; otherwise leaves x untouched.
template <typename T>
inline void Mx(T &x, T y) {
	if (x < y) x = y;
}

// Lowers x to y when y is strictly smaller; otherwise leaves x untouched.
template <typename T>
inline void Mn(T &x, T y) {
	if (x > y) x = y;
}

// Shorthand macros: `op` expands to `com operator`, `con` expands to `const`.
#define op com operator 
#define con const
typedef double db;
// Capacity of the global arrays declared below.
const int N = 3000005;
// Modulus used by add(), fpw() and every `% P` in this program.
const int P = 998244353;
// A, B: buffers that main reads the two input sequences into and that dft
// transforms in place. E: twiddle-factor table filled in by main.
int A[N], B[N]; ll E[N];
// r: bit-reversal permutation used by dft. lim: transform length, starting at 1
// and doubled in main. L: counts how many times lim was doubled.
int r[N], lim = 1, L; 

// Returns x + y reduced by one subtraction of P when the sum reaches P.
// The result lies in [0, P) provided both arguments lie in [0, P].
inline int add(int x, int y) {
	const int sum = x + y;
	if (sum >= P) return sum - P;
	return sum;
}

// In-place forward number-theoretic transform of A[0 .. lim) modulo P.
//
// Relies on globals prepared by main:
//   lim - transform length (a power of two),
//   r   - bit-reversal permutation of 0 .. lim-1,
//   E   - twiddle table; E[i + k] is the k-th twiddle of the stage whose
//         half-block length is i.
// The parameter A shadows the global array of the same name.
// There is no inverse flag: main obtains the inverse by calling dft again,
// reversing A[1 .. lim-1] and multiplying by the modular inverse of lim.
void dft(int *A) {
	// Apply the bit-reversal permutation; the r[i] > i test swaps each pair once.
	for (int i = 1;i < lim; i++)
		if (r[i] > i) swap(A[i], A[r[i]]);
	// Stage with half-block 1, written out by hand. Its only twiddle is 1,
	// so no multiplication is needed. add(x, P - y) is x - y modulo P.
	if (lim >= 2) 
	for (int j = 0;j < lim; j += 2) {
		int x = A[j], y = A[j+1];
		A[j] = add(x, y), A[j+1] = add(x, P - y);
	}
	// Stage with half-block 2: the first butterfly has twiddle 1 (multiply
	// skipped), the second uses E[3].
	if (lim >= 4)
	for (int j = 0;j < lim; j += 4) {
		int x = A[j], y = A[j+2];
		A[j] = add(x, y), A[j+2] = add(x , P - y);
		x = A[j+1], y = E[3] * A[j+3] % P;
		A[j+1] = add(x, y), A[j+3] = add(x , P - y);
	}
	// Stage with half-block 4: fully unrolled, twiddles E[5], E[6], E[7]
	// (the first butterfly again has twiddle 1).
	if (lim >= 8)
	for (int j = 0;j < lim; j += 8) {
		int x = A[j], y = A[j+4];
		A[j] = add(x, y), A[j+4] = add(x , P - y);
		x = A[j+1], y = E[5] * A[j+5] % P;
		A[j+1] = add(x, y), A[j+5] = add(x , P - y);
		x = A[j+2], y = E[6] * A[j+6] % P;
		A[j+2] = add(x, y), A[j+6] = add(x , P - y);
		x = A[j+3], y = E[7] * A[j+7] % P;
		A[j+3] = add(x, y), A[j+7] = add(x , P - y);
	}
	// Remaining stages: i is the half-block length, doubling each pass.
	for (int i = 8;i < lim; i <<= 1) {
		for (int j = 0;j < lim; j += (i << 1)) {
			// f: lower half of the block, g: upper half, e: this stage's twiddles.
			int *f = A + j, *g = f + i; ll *e = E + i;
			// Two butterflies per iteration (k is advanced inside the body as
			// well); i is even here, so k never runs past i.
			for (int k = 0;k < i; k++) {
				int x = f[k], y = e[k] * g[k] % P;
				f[k] = add(x, y), g[k] = add(x , P - y);
				k++;
				x = f[k], y = e[k] * g[k] % P;
				f[k] = add(x, y), g[k] = add(x , P - y);
			}
		}
	}
}

// Binary exponentiation: returns x raised to the power mi, reduced modulo P
// after every multiplication. Returns 1 when mi is 0.
ll fpw(ll x, ll mi) {
	ll acc = 1;
	while (mi) {
		// Fold the current power of x into the result when this exponent bit is set.
		if (mi & 1) acc = acc * x % P;
		x = x * x % P;
		mi >>= 1;
	}
	return acc;
}

int m, n;
int main() {
	read(n), read(m);
	for (int i = 0;i <= n; i++) read(A[i]);
	for (int i = 0;i <= m; i++) read(B[i]);
	while (lim <= (n + m)) lim <<= 1, L++;
	int len = lim >> 1;
	for (int i = 1;i < lim; i++)
		r[i] = r[i >> 1] >> 1 | ((i & 1) ? len : 0);
	E[1] = 1;
	for (int i = 2;i < lim; i <<= 1) {
		ll *e0 = E + i / 2, *e1 = E + i;
		ll w = fpw(3, (P - 1) / (i << 1));
		for (int j = 0;j < i; j += 2) 
			e1[j] = e0[j>>1], e1[j+1] = e1[j] * w % P;
	}
	dft(A), dft(B);
	for (int i = 0;i < lim; i++) A[i] = (ll)A[i] * B[i] % P;
	dft(A); reverse(A + 1, A + lim); int inv = fpw(lim, P - 2);
	for (int i = 0;i <= n + m; i++) write(1ll * A[i] * inv % P, ' ');
	return 0;
}

下面是 fft

#include <queue>
#include <vector>
#include <iostream>
#include <cstdio>
#include <cstring>
#include <algorithm>
#include <cmath>
#define MP make_pair
#define ll long long
#define fi first
#define se second
using namespace std;

// Reads one decimal integer from stdin and returns it.
// Every character before the first digit is skipped; if any skipped character
// is '-', the result is negated. One character after the last digit is consumed.
// If end of input is reached before any digit, 0 is returned instead of
// looping forever.
int read(void) {
    int x = 0;
    bool neg = false;
    // Keep the character as int so EOF stays distinguishable from real input.
    int c = getchar();
    while (c != EOF && (c < '0' || c > '9')) {
        if (c == '-') neg = true;
        c = getchar();
    }
    while (c >= '0' && c <= '9') {
        x = x * 10 + (c - '0');
        c = getchar();
    }
    return neg ? -x : x;
}

// Prints the integer x in decimal to stdout, followed by the character ed
// (a newline by default).
// Digits are taken from the signed remainder rather than from -x, so INT_MIN
// is printed correctly without overflowing.
inline void write(int x, char ed = '\n')
{
	char digits[16];
	int cnt = 0;
	if (x < 0) putchar('-');
	// Digits come out least-significant first; buffer them and emit in reverse.
	do {
		int d = x % 10;
		digits[cnt++] = (char)('0' + (d < 0 ? -d : d));
		x /= 10;
	} while (x);
	while (cnt) putchar(digits[--cnt]);
	putchar(ed);
}

// Replaces x with y if y is strictly larger than x.
template <typename T>
inline void Mx(T &x, T y) {
	if (x < y) {
		x = y;
	}
}

// Replaces x with y if y is strictly smaller than x.
template <typename T>
inline void Mn(T &x, T y) {
	if (x > y) {
		x = y;
	}
}

// Shorthand macros: `op` expands to `com operator`, `con` expands to `const`.
#define op com operator 
#define con const
typedef double db;
// Capacity of the global arrays in this program.
const int N = 3000005;
// pi, obtained as acos(-1).
const double Pi = acos(-1.0);
struct com {
	db x, y;
	com(db a = 0, db b = 0) : x(a) , y(b) {}
	op + (con com &w) con { return com(x + w.x, y + w.y); }
	op - (con com &w) con { return com(x - w.x, y - w.y); }
	op * (con com &w) con { return com(x * w.x - y * w.y, x * w.y + y * w.x); }
	op - (void) con { return com(-x, -y); }
	com mi() con { return com(-y, x); }
}A[N], E[N];

// r: bit-reversal permutation used by dft. lim: transform length, starting at 1
// and doubled in main. L: counts how many times lim was doubled.
int r[N], lim = 1, L; 
// In-place forward complex FFT of A[0 .. lim).
//
// Relies on globals prepared by main:
//   lim - transform length (a power of two),
//   r   - bit-reversal permutation of 0 .. lim-1,
//   E   - twiddle table; E[i + k] is the k-th twiddle of the stage whose
//         half-block length is i.
// The parameter A shadows the global array of the same name.
// There is no inverse flag: main calls dft a second time, reverses
// A[1 .. lim-1] and divides by the length when printing.
void dft(com *A) {
	// Apply the bit-reversal permutation; the r[i] > i test swaps each pair once.
	for (int i = 1;i < lim; i++)
		if (r[i] > i) swap(A[i], A[r[i]]);
	// Stage with half-block 1: the only twiddle is 1, so this is a plain
	// sum/difference.
	if (lim >= 2) 
	for (int j = 0;j < lim; j += 2) {
		com x = A[j], y = A[j+1];
		A[j] = x + y, A[j+1] = x - y;
	}
	// Stage with half-block 2: the second butterfly's twiddle is a quarter turn
	// (main builds E[3] from cos(Pi/2), sin(Pi/2)), so mi() is used in place of
	// a full complex multiply.
	if (lim >= 4)
	for (int j = 0;j < lim; j += 4) {
		com x = A[j], y = A[j+2];
		A[j] = x + y, A[j+2] = x - y;
		x = A[j+1], y = A[j+3].mi();
		A[j+1] = x + y, A[j+3] = x - y;
	}
	// Stage with half-block 4, fully unrolled: twiddles are 1, E[5], a quarter
	// turn (mi() again, standing in for E[6]) and E[7].
	if (lim >= 8)
	for (int j = 0;j < lim; j += 8) {
		com x = A[j], y = A[j+4];
		A[j] = x + y, A[j+4] = x - y;
		x = A[j+1], y = A[j+5] * E[5];
		A[j+1] = x + y, A[j+5] = x - y;
		x = A[j+2], y = A[j+6].mi();
		A[j+2] = x + y, A[j+6] = x - y;
		x = A[j+3], y = A[j+7] * E[7];
		A[j+3] = x + y, A[j+7] = x - y;
	}
	// Remaining stages: i is the half-block length, doubling each pass.
	for (int i = 8;i < lim; i <<= 1) {
		for (int j = 0;j < lim; j += (i << 1)) {
			// f: lower half of the block, g: upper half, e: this stage's twiddles.
			com *f = A + j, *g = f + i, *e = E + i;
			// Two butterflies per iteration (k is advanced inside the body as
			// well); i is even here, so k never runs past i.
			for (int k = 0;k < i; k++) {
				com x = f[k], y = g[k] * e[k];
				f[k] = x + y, g[k] = x - y;
				k++;
				x = f[k], y = g[k] * e[k];
				f[k] = x + y, g[k] = x - y;
			}
		}
	}
}

int m, n;
int main() {
	n = read(), m = read();
	for (int i = 0;i <= n; i++) A[i].x = read();
	for (int i = 0;i <= m; i++) A[i].y = read();
	while (lim <= (n + m)) lim <<= 1, L++;
	int len = lim >> 1;
	for (int i = 1;i < lim; i++)
		r[i] = r[i >> 1] >> 1 | ((i & 1) ? len : 0);
	E[1] = com(1, 0);
	for (int i = 2;i < lim; i <<= 1) {
		com *e0 = E + i / 2, *e1 = E + i;
		com w(cos(Pi / i), sin(Pi / i));
		for (int j = 0;j < i; j += 2) 
			e1[j] = e0[j>>1], e1[j+1] = e1[j] * w;
	}
	dft(A);
	for (int i = 0;i < lim; i++) A[i] = A[i] * A[i];
	dft(A); reverse(A + 1, A + lim); lim *= 2;
	for (int i = 0;i <= n + m; i++)
		write((int)(A[i].y / lim + 0.5), ' ');
	return 0;
}

相关阅读:
文件操作小练习
 阶段练习1
copy小练习
 小练习
 str 小列题
 条款50：使用自定义的new以及delete的时机会
 条款49：了解new-handle行为
 简单地说一下：traits技法就是一种模板元编程，它可以将本来处于运行期的事拉到编译期来做，从而提高运行效率。先看一个非模板元编程的例子，就是前面的那个例子：
条款47：请使用traits class表示类型信息
 条款46：需要类型转换的时候请为模板定义非成员函数
原文地址：https://www.cnblogs.com/Hs-black/p/13414751.html