Utilities/src/crc32c.cc

0001 /* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
0002  * Copyright (C) 2013 Mark Adler
0003  * Version 1.1  1 Aug 2013  Mark Adler
0004  */
0005
0006 /*
0007   This software is provided 'as-is', without any express or implied
0008   warranty.  In no event will the author be held liable for any damages
0009   arising from the use of this software.
0010
0011   Permission is granted to anyone to use this software for any purpose,
0012   including commercial applications, and to alter it and redistribute it
0013   freely, subject to the following restrictions:
0014
0015   1. The origin of this software must not be misrepresented; you must not
0016      claim that you wrote the original software. If you use this software
0017      in a product, an acknowledgment in the product documentation would be
0018      appreciated but is not required.
0019   2. Altered source versions must be plainly marked as such, and must not be
0020      misrepresented as being the original software.
0021   3. This notice may not be removed or altered from any source distribution.
0022
0023   Mark Adler
0024   madler@alumni.caltech.edu
0025  */
0026
0027 /* Use hardware CRC instruction on Intel SSE 4.2 processors.  This computes a
0028    CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc.  A software
0029    version is provided as a fall-back, as well as for speed comparisons. */
0030
0031 /* Version history:
0032    1.0  10 Feb 2013  First version
0033    1.1   1 Aug 2013  Correct comments on why three crc instructions in parallel
0034  */
0035
0036 /* Srecko Morovic, Apr 02 2015: (crc32.cc) modified to compile in c++
0037  * and ifdefs to compile and call hw version only with X86_64
0038  */
0039
0040 #include <cstdio>
0041 #include <cstdlib>
0042 #include <cstdint>
0043 #include <unistd.h>
0044 #include <pthread.h>
0045
0046 /* CRC-32C (iSCSI) polynomial in reversed bit order. */
0047 #define POLY 0x82f63b78
0048
0049 /* Table for a quadword-at-a-time software crc. */
0050 static pthread_once_t crc32c_once_sw = PTHREAD_ONCE_INIT;
0051 static uint32_t crc32c_table[8][256];
0052
0053 /* Construct table for software CRC-32C calculation. */
0054 static void crc32c_init_sw(void) {
0055   uint32_t n, crc, k;
0056
0057   for (n = 0; n < 256; n++) {
0058     crc = n;
0059     crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
0060     crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
0061     crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
0062     crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
0063     crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
0064     crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
0065     crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
0066     crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
0067     crc32c_table[0][n] = crc;
0068   }
0069   for (n = 0; n < 256; n++) {
0070     crc = crc32c_table[0][n];
0071     for (k = 1; k < 8; k++) {
0072       crc = crc32c_table[0][crc & 0xff] ^ (crc >> 8);
0073       crc32c_table[k][n] = crc;
0074     }
0075   }
0076 }
0077
0078 /* Table-driven software version as a fall-back.  This is about 15 times slower
0079    than using the hardware instructions.  This assumes little-endian integers,
0080    as is the case on Intel processors that the assembler code here is for. */
0081 static uint32_t crc32c_sw(uint32_t crci, const unsigned char *buf, size_t len) {
0082   const unsigned char *next = buf;
0083   uint64_t crc;
0084
0085   pthread_once(&crc32c_once_sw, crc32c_init_sw);
0086   crc = crci ^ 0xffffffff;
0087   while (len && ((const uintptr_t)next & 7) != 0) {
0088     crc = crc32c_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
0089     len--;
0090   }
0091   while (len >= 8) {
0092     crc ^= *(uint64_t *)next;
0093     crc = crc32c_table[7][crc & 0xff] ^ crc32c_table[6][(crc >> 8) & 0xff] ^ crc32c_table[5][(crc >> 16) & 0xff] ^
0094           crc32c_table[4][(crc >> 24) & 0xff] ^ crc32c_table[3][(crc >> 32) & 0xff] ^
0095           crc32c_table[2][(crc >> 40) & 0xff] ^ crc32c_table[1][(crc >> 48) & 0xff] ^ crc32c_table[0][crc >> 56];
0096     next += 8;
0097     len -= 8;
0098   }
0099   while (len) {
0100     crc = crc32c_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
0101     len--;
0102   }
0103   return (uint32_t)crc ^ 0xffffffff;
0104 }
0105
0106 #if defined(__x86_64__)
0107 /* Multiply a matrix times a vector over the Galois field of two elements,
0108    GF(2).  Each element is a bit in an unsigned integer.  mat must have at
0109    least as many entries as the power of two for most significant one bit in
0110    vec. */
0111 static inline uint32_t gf2_matrix_times(uint32_t *mat, uint32_t vec) {
0112   uint32_t sum;
0113
0114   sum = 0;
0115   while (vec) {
0116     if (vec & 1)
0117       sum ^= *mat;
0118     vec >>= 1;
0119     mat++;
0120   }
0121   return sum;
0122 }
0123
0124 /* Multiply a matrix by itself over GF(2).  Both mat and square must have 32
0125    rows. */
0126 static inline void gf2_matrix_square(uint32_t *square, uint32_t *mat) {
0127   int n;
0128
0129   for (n = 0; n < 32; n++)
0130     square[n] = gf2_matrix_times(mat, mat[n]);
0131 }
0132
0133 /* Construct an operator to apply len zeros to a crc.  len must be a power of
0134    two.  If len is not a power of two, then the result is the same as for the
0135    largest power of two less than len.  The result for len == 0 is the same as
0136    for len == 1.  A version of this routine could be easily written for any
0137    len, but that is not needed for this application. */
0138 static void crc32c_zeros_op(uint32_t *even, size_t len) {
0139   int n;
0140   uint32_t row;
0141   uint32_t odd[32]; /* odd-power-of-two zeros operator */
0142
0143   /* put operator for one zero bit in odd */
0144   odd[0] = POLY; /* CRC-32C polynomial */
0145   row = 1;
0146   for (n = 1; n < 32; n++) {
0147     odd[n] = row;
0148     row <<= 1;
0149   }
0150
0151   /* put operator for two zero bits in even */
0152   gf2_matrix_square(even, odd);
0153
0154   /* put operator for four zero bits in odd */
0155   gf2_matrix_square(odd, even);
0156
0157   /* first square will put the operator for one zero byte (eight zero bits),
0158        in even -- next square puts operator for two zero bytes in odd, and so
0159        on, until len has been rotated down to zero */
0160   do {
0161     gf2_matrix_square(even, odd);
0162     len >>= 1;
0163     if (len == 0)
0164       return;
0165     gf2_matrix_square(odd, even);
0166     len >>= 1;
0167   } while (len);
0168
0169   /* answer ended up in odd -- copy to even */
0170   for (n = 0; n < 32; n++)
0171     even[n] = odd[n];
0172 }
0173
0174 /* Take a length and build four lookup tables for applying the zeros operator
0175    for that length, byte-by-byte on the operand. */
0176 static void crc32c_zeros(uint32_t zeros[][256], size_t len) {
0177   uint32_t n;
0178   uint32_t op[32];
0179
0180   crc32c_zeros_op(op, len);
0181   for (n = 0; n < 256; n++) {
0182     zeros[0][n] = gf2_matrix_times(op, n);
0183     zeros[1][n] = gf2_matrix_times(op, n << 8);
0184     zeros[2][n] = gf2_matrix_times(op, n << 16);
0185     zeros[3][n] = gf2_matrix_times(op, n << 24);
0186   }
0187 }
0188
0189 /* Apply the zeros operator table to crc. */
0190 static inline uint32_t crc32c_shift(uint32_t zeros[][256], uint32_t crc) {
0191   return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^ zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24];
0192 }
0193
0194 /* Block sizes for three-way parallel crc computation.  LONG and SHORT must
0195    both be powers of two.  The associated string constants must be set
0196    accordingly, for use in constructing the assembler instructions. */
0197 #define LONG 8192
0198 #define LONGx1 "8192"
0199 #define LONGx2 "16384"
0200 #define SHORT 256
0201 #define SHORTx1 "256"
0202 #define SHORTx2 "512"
0203
0204 /* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */
0205 static pthread_once_t crc32c_once_hw = PTHREAD_ONCE_INIT;
0206 static uint32_t crc32c_long[4][256];
0207 static uint32_t crc32c_short[4][256];
0208
0209 /* Initialize tables for shifting crcs. */
0210 static void crc32c_init_hw(void) {
0211   crc32c_zeros(crc32c_long, LONG);
0212   crc32c_zeros(crc32c_short, SHORT);
0213 }
0214
0215 /* Compute CRC-32C using the Intel hardware instruction. */
0216 static uint32_t crc32c_hw(uint32_t crc, const unsigned char *buf, size_t len) {
0217   const unsigned char *next = buf;
0218   const unsigned char *end;
0219   uint64_t crc0, crc1, crc2; /* need to be 64 bits for crc32q */
0220
0221   /* populate shift tables the first time through */
0222   pthread_once(&crc32c_once_hw, crc32c_init_hw);
0223
0224   /* pre-process the crc */
0225   crc0 = crc ^ 0xffffffff;
0226
0227   /* compute the crc for up to seven leading bytes to bring the data pointer
0228        to an eight-byte boundary */
0229   while (len && ((const uintptr_t)next & 7) != 0) {
0230     __asm__(
0231         "crc32b\t"
0232         "(%1), %0"
0233         : "=r"(crc0)
0234         : "r"(next), "0"(crc0));
0235     next++;
0236     len--;
0237   }
0238
0239   /* compute the crc on sets of LONG*3 bytes, executing three independent crc
0240        instructions, each on LONG bytes -- this is optimized for the Nehalem,
0241        Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
0242        throughput of one crc per cycle, but a latency of three cycles */
0243   while (len >= LONG * 3) {
0244     crc1 = 0;
0245     crc2 = 0;
0246     end = next + LONG;
0247     do {
0248       __asm__(
0249           "crc32q\t"
0250           "(%3), %0\n\t"
0251           "crc32q\t" LONGx1
0252           "(%3), %1\n\t"
0253           "crc32q\t" LONGx2 "(%3), %2"
0254           : "=r"(crc0), "=r"(crc1), "=r"(crc2)
0255           : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
0256       next += 8;
0257     } while (next < end);
0258     crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1;
0259     crc0 = crc32c_shift(crc32c_long, crc0) ^ crc2;
0260     next += LONG * 2;
0261     len -= LONG * 3;
0262   }
0263
0264   /* do the same thing, but now on SHORT*3 blocks for the remaining data less
0265        than a LONG*3 block */
0266   while (len >= SHORT * 3) {
0267     crc1 = 0;
0268     crc2 = 0;
0269     end = next + SHORT;
0270     do {
0271       __asm__(
0272           "crc32q\t"
0273           "(%3), %0\n\t"
0274           "crc32q\t" SHORTx1
0275           "(%3), %1\n\t"
0276           "crc32q\t" SHORTx2 "(%3), %2"
0277           : "=r"(crc0), "=r"(crc1), "=r"(crc2)
0278           : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
0279       next += 8;
0280     } while (next < end);
0281     crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1;
0282     crc0 = crc32c_shift(crc32c_short, crc0) ^ crc2;
0283     next += SHORT * 2;
0284     len -= SHORT * 3;
0285   }
0286
0287   /* compute the crc on the remaining eight-byte units less than a SHORT*3
0288        block */
0289   end = next + (len - (len & 7));
0290   while (next < end) {
0291     __asm__(
0292         "crc32q\t"
0293         "(%1), %0"
0294         : "=r"(crc0)
0295         : "r"(next), "0"(crc0));
0296     next += 8;
0297   }
0298   len &= 7;
0299
0300   /* compute the crc for up to seven trailing bytes */
0301   while (len) {
0302     __asm__(
0303         "crc32b\t"
0304         "(%1), %0"
0305         : "=r"(crc0)
0306         : "r"(next), "0"(crc0));
0307     next++;
0308     len--;
0309   }
0310
0311   /* return a post-processed crc */
0312   return (uint32_t)crc0 ^ 0xffffffff;
0313 }
0314
0315 /* Check for SSE 4.2.  SSE 4.2 was first supported in Nehalem processors
0316    introduced in November, 2008.  This does not check for the existence of the
0317    cpuid instruction itself, which was introduced on the 486SL in 1992, so this
0318    will fail on earlier x86 processors.  cpuid works on all Pentium and later
0319    processors. */
0320 #define SSE42(have)                                           \
0321   do {                                                        \
0322     uint32_t eax, ecx;                                        \
0323     eax = 1;                                                  \
0324     __asm__("cpuid" : "=c"(ecx) : "a"(eax) : "%ebx", "%edx"); \
0325     (have) = (ecx >> 20) & 1;                                 \
0326   } while (0)
0327
0328 #endif  //defined(__x86_64__)
0329
0330 /* Compute a CRC-32C.  If the crc32 instruction is available, use the hardware
0331    version.  Otherwise, use the software version. */
0332 uint32_t crc32c(uint32_t crc, const unsigned char *buf, size_t len) {
0333 #if defined(__x86_64__)
0334   int sse42;
0335
0336   SSE42(sse42);
0337   return sse42 ? crc32c_hw(crc, buf, len) : crc32c_sw(crc, buf, len);
0338 #else
0339   return crc32c_sw(crc, buf, len);
0340 #endif
0341 }
0342
0343 bool crc32c_hw_test() {
0344 #if defined(__x86_64__)
0345   int sse42;
0346
0347   SSE42(sse42);
0348   return sse42;
0349 #else
0350   return 0;
0351 #endif
0352 }