#include <stddef.h>
/*
 * Net balance of 's' versus 'p' over the first n bytes of s.
 *
 * Every 's' contributes +1, every 'p' contributes -1, and any other byte
 * (including '\0') contributes nothing. Exactly n bytes are examined; the
 * buffer does not need to be NUL-terminated.
 *
 * Returns the accumulated balance (0 when n is 0).
 */
int run_switches(const char *s, size_t n)
{
    int balance = 0;

    for (size_t i = 0; i < n; i++) {
        const char c = s[i];

        /* Each comparison evaluates to 0 or 1, so no branch is needed. */
        balance += (c == 's');
        balance -= (c == 'p');
    }

    return balance;
}
I got ~31GB/s in GCC and ~33GB/s in Clang. This is without any padding, or SIMD intrinsics, or any such nonsense. This is just untying the compiler's hands and giving it permission to do its job properly. Don't want to pass the string length? That's fine, we can figure that out for ourselves. This code:
#include <stddef.h>
#include <string.h>
/*
 * Net balance of 's' versus 'p' in the NUL-terminated string s.
 *
 * The length is measured once with strlen() up front, then that many bytes
 * are scanned: every 's' contributes +1, every 'p' contributes -1, and any
 * other byte contributes nothing.
 *
 * Returns the accumulated balance (0 for the empty string).
 */
int run_switches(const char *s)
{
    const size_t len = strlen(s);
    int balance = 0;

    for (size_t i = 0; i < len; i++) {
        const char c = s[i];

        /* Each comparison evaluates to 0 or 1, so no branch is needed. */
        balance += (c == 's');
        balance -= (c == 'p');
    }

    return balance;
}
Is 27GB/s. With a little bit of blocking:
#include <stddef.h>
/*
 * Net balance of 's' versus 'p' over the first n bytes of s, computed in
 * blocks of 64 bytes.
 *
 * Every 's' contributes +1, every 'p' contributes -1, and any other byte
 * contributes nothing. The leading (n % 64) bytes are handled first, then
 * the remaining bytes in full 64-byte blocks. Within each block the partial
 * sum is kept in an 8-bit accumulator and folded into the int total once
 * per block.
 *
 * Returns the accumulated balance (0 when n is 0).
 */
int run_switches(const char *s, size_t n)
{
    enum { BLOCK_SHIFT = 6, BLOCK = 1 << BLOCK_SHIFT };

    int res = 0;
    /*
     * Must be explicitly signed: the signedness of plain `char` is
     * implementation-defined, and where it is unsigned a net-negative block
     * would be stored as e.g. 255 instead of -1 and corrupt the total.
     * A block holds at most 64 bytes, so the partial sum stays within
     * [-64, 64], which always fits in signed char.
     */
    signed char tmp = 0;

    /* Leading partial block: n mod 64 bytes. */
    for (size_t i = n & (BLOCK - 1); i--; ++s) {
        tmp += (*s == 's') - (*s == 'p');
    }
    res += tmp;

    /* Remaining bytes: n / 64 full blocks. */
    for (n >>= BLOCK_SHIFT; n--;) {
        tmp = 0;
        for (size_t i = BLOCK; i--; ++s) {
            tmp += (*s == 's') - (*s == 'p');
        }
        res += tmp;
    }

    return res;
}
That's ~55GB/s. Anyway, the point is, you're pretty far from the point where you ought to give up on C and dive into assembly.