直接上代码就好了,具体原理见 ouuan 的介绍。有什么不会的都可以到这里来了解。
但是由于本题值域是 `long long`,和 `int` 能压 8 位相比只能压 4 位,所以常数优化没有想象中的快。用 `__m512i` 类型压 8 位也没有速度上的改善。
然后很显然,想在 acwing 这种性能不太行的机子上过,必然是要火车头优化拉满的。
#pragma GCC optimize(3)
#pragma GCC optimize("Ofast")
#pragma GCC optimize("inline")
#pragma GCC optimize("-fgcse")
#pragma GCC optimize("-fgcse-lm")
#pragma GCC optimize("-fipa-sra")
#pragma GCC optimize("-ftree-pre")
#pragma GCC optimize("-ftree-vrp")
#pragma GCC optimize("-fpeephole2")
#pragma GCC optimize("-ffast-math")
#pragma GCC optimize("-fsched-spec")
#pragma GCC optimize("unroll-loops")
#pragma GCC optimize("-falign-jumps")
#pragma GCC optimize("-falign-loops")
#pragma GCC optimize("-falign-labels")
#pragma GCC optimize("-fdevirtualize")
#pragma GCC optimize("-fcaller-saves")
#pragma GCC optimize("-fcrossjumping")
#pragma GCC optimize("-fthread-jumps")
#pragma GCC optimize("-funroll-loops")
#pragma GCC optimize("-fwhole-program")
#pragma GCC optimize("-freorder-blocks")
#pragma GCC optimize("-fschedule-insns")
#pragma GCC optimize("inline-functions")
#pragma GCC optimize("-ftree-tail-merge")
#pragma GCC optimize("-fschedule-insns2")
#pragma GCC optimize("-fstrict-aliasing")
#pragma GCC optimize("-fstrict-overflow")
#pragma GCC optimize("-falign-functions")
#pragma GCC optimize("-fcse-skip-blocks")
#pragma GCC optimize("-fcse-follow-jumps")
#pragma GCC optimize("-fsched-interblock")
#pragma GCC optimize("-fpartial-inlining")
#pragma GCC optimize("no-stack-protector")
#pragma GCC optimize("-freorder-functions")
#pragma GCC optimize("-findirect-inlining")
#pragma GCC optimize("-fhoist-adjacent-loads")
#pragma GCC optimize("-frerun-cse-after-loop")
#pragma GCC optimize("inline-small-functions")
#pragma GCC optimize("-finline-small-functions")
#pragma GCC optimize("-ftree-switch-conversion")
#pragma GCC optimize("-foptimize-sibling-calls")
#pragma GCC optimize("-fexpensive-optimizations")
#pragma GCC optimize("-funsafe-loop-optimizations")
#pragma GCC optimize("inline-functions-called-once")
#pragma GCC optimize("-fdelete-null-pointer-checks")
#include <stdio.h>
#include <immintrin.h>
#define getchar getchar_unlocked
#define putchar putchar_unlocked
#pragma GCC target("sse,sse2,sse3,ssse3,sse4.1,sse4.2,avx,avx2,popcnt,tune=native")
// Upper bound on the element count; it sizes the vector storage A below.
const int N = 100010;
// Signed integer type (long long) used for element values, deltas and sums.
typedef long long i64;
/*
 * Read the next decimal integer from standard input.
 *
 * Every character that is not an ASCII digit is skipped; if a '-' is seen
 * anywhere in the skipped run, the result is negated. The digit run is then
 * consumed together with the single character that terminates it.
 *
 * Returns the parsed value, or 0 when the input ends before any digit.
 */
i64 rd()
{
    /* getchar() returns int: keeping the full range lets EOF be told apart
       from every real character (stored in a char, EOF made the skip loop
       spin forever). */
    int c = getchar();
    int negative = 0;
    while (c != EOF && (c < '0' || c > '9'))
    {
        if (c == '-')
            negative = 1;
        c = getchar();
    }

    /* Accumulate the magnitude in unsigned arithmetic so that the most
       negative value parses without signed overflow. */
    unsigned long long magnitude = 0;
    while (c >= '0' && c <= '9')
    {
        magnitude = magnitude * 10 + (unsigned long long)(c - '0');
        c = getchar();
    }
    return negative ? (i64)(0 - magnitude) : (i64)magnitude;
}
/*
 * Write x to standard output in decimal, preceded by '-' when x is negative.
 * No separator or newline is emitted.
 */
void wr(i64 x)
{
    unsigned long long magnitude = (unsigned long long)x;
    if (x < 0)
    {
        putchar('-');
        /* Negate in unsigned arithmetic: -x overflows for the most negative
           i64 and would print garbage digits. */
        magnitude = 0 - magnitude;
    }

    /* Each byte contributes fewer than 3 decimal digits, so this is always
       large enough. Digits are produced least-significant first. */
    char digits[sizeof(unsigned long long) * 3];
    int count = 0;
    do
    {
        digits[count++] = (char)('0' + magnitude % 10);
        magnitude /= 10;
    } while (magnitude != 0);

    while (count > 0)
        putchar(digits[--count]);
}
/*
 * Read the next operation letter from standard input.
 *
 * Skips characters until an uppercase ASCII letter ('A'..'Z') is found.
 * Returns 1 if that letter is 'C', and 0 for any other letter or when the
 * input ends first.
 */
int rd_op()
{
    /* int, not char: EOF must stay distinguishable, otherwise the skip loop
       never terminates on truncated input. */
    int c = getchar();
    while (c != EOF && (c < 'A' || c > 'Z'))
        c = getchar();
    return c == 'C';
}
// n and m are the first two integers read in main: main fills n elements
// and then processes m operations.
int n, m;
// Current operation: op is 1 for 'C' (range add) and 0 otherwise (range sum);
// l and r are its bounds as passed to modify_add / query.
int op, l, r;
// Scalar view of the data, 0-indexed: main points it at the start of A.
i64* a;
// Backing storage: each __m256i holds 4 consecutive 64-bit elements.
__m256i A[N >> 2];
/*
 * Add x to every element in the half-open, 0-indexed range [l, r).
 *
 * The unaligned head and tail are updated one element at a time through a;
 * the middle, which starts and ends on a multiple of 4, is updated four
 * elements per step through the vector array A.
 */
void modify_add(int l, int r, i64 x)
{
    /* Head: step l forward until it is a multiple of 4 or the range ends. */
    for (; (l & 3) && l < r; ++l)
        a[l] += x;
    if (l == r)
        return;

    /* Tail: step r backward until it is a multiple of 4. */
    while (r & 3)
    {
        --r;
        a[r] += x;
    }
    if (l == r)
        return;

    /* Middle: whole 4-element vectors. */
    const __m256i delta = _mm256_set1_epi64x(x);
    int block = l >> 2;
    const int block_end = r >> 2;
    while (block < block_end)
    {
        A[block] = _mm256_add_epi64(A[block], delta);
        ++block;
    }
}
/*
 * Return the sum of the elements in the half-open, 0-indexed range [l, r).
 *
 * The unaligned head and tail are summed one element at a time through a;
 * the middle is accumulated four lanes at a time from the vector array A
 * and the four lane totals are added in at the end.
 */
i64 query(int l, int r)
{
    i64 total = 0;

    /* Head: step l forward until it is a multiple of 4 or the range ends. */
    for (; (l & 3) && l < r; ++l)
        total += a[l];
    if (l == r)
        return total;

    /* Tail: step r backward until it is a multiple of 4. */
    while (r & 3)
    {
        --r;
        total += a[r];
    }
    if (l == r)
        return total;

    /* Middle: whole 4-element vectors, accumulated lane-wise. */
    __m256i lanes = _mm256_set1_epi32(0);
    int block = l >> 2;
    const int block_end = r >> 2;
    while (block < block_end)
    {
        lanes = _mm256_add_epi64(lanes, A[block]);
        ++block;
    }

    /* Fold the four 64-bit lanes into the scalar total. */
    int lane = 0;
    do
    {
        total += lanes[lane];
    } while (++lane < 4);
    return total;
}
int main()
{
n = rd(), m = rd(), a = (i64*)&A;
for (int i = 0; i < n; ++i)
a[i] = rd();
while (m--)
{
op = rd_op(), l = rd() - 1, r = rd();
if (op & 1)
modify_add(l, r, rd());
else
wr(query(l, r)), putchar('\n');
}
}