Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2021-02-14 13:26:27

0001 /* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
0002  * Copyright (C) 2013 Mark Adler
0003  * Version 1.1  1 Aug 2013  Mark Adler
0004  */
0005 
0006 /*
0007   This software is provided 'as-is', without any express or implied
0008   warranty.  In no event will the author be held liable for any damages
0009   arising from the use of this software.
0010 
0011   Permission is granted to anyone to use this software for any purpose,
0012   including commercial applications, and to alter it and redistribute it
0013   freely, subject to the following restrictions:
0014 
0015   1. The origin of this software must not be misrepresented; you must not
0016      claim that you wrote the original software. If you use this software
0017      in a product, an acknowledgment in the product documentation would be
0018      appreciated but is not required.
0019   2. Altered source versions must be plainly marked as such, and must not be
0020      misrepresented as being the original software.
0021   3. This notice may not be removed or altered from any source distribution.
0022 
0023   Mark Adler
0024   madler@alumni.caltech.edu
0025  */
0026 
0027 /* Use hardware CRC instruction on Intel SSE 4.2 processors.  This computes a
0028    CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc.  A software
0029    version is provided as a fall-back, as well as for speed comparisons. */
0030 
0031 /* Version history:
0032    1.0  10 Feb 2013  First version
0033    1.1   1 Aug 2013  Correct comments on why three crc instructions in parallel
0034  */
0035 
0036 /* Srecko Morovic, Apr 02 2015: (crc32.cc) modified to compile in c++
0037  * and ifdefs to compile and call hw version only with X86_64
0038  */
0039 
0040 
0041 
0042 #include <cstdio>
0043 #include <cstdlib>
0044 #include <cstdint>
0045 #include <unistd.h>
0046 #include <pthread.h>
0047 
0048 /* CRC-32C (iSCSI) polynomial in reversed bit order. */
0049 #define POLY 0x82f63b78
0050 
0051 /* Table for a quadword-at-a-time software crc. */
0052 static pthread_once_t crc32c_once_sw = PTHREAD_ONCE_INIT;
0053 static uint32_t crc32c_table[8][256];
0054 
0055 /* Construct table for software CRC-32C calculation. */
0056 static void crc32c_init_sw(void)
0057 {
0058     uint32_t n, crc, k;
0059 
0060     for (n = 0; n < 256; n++) {
0061         crc = n;
0062         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
0063         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
0064         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
0065         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
0066         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
0067         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
0068         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
0069         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
0070         crc32c_table[0][n] = crc;
0071     }
0072     for (n = 0; n < 256; n++) {
0073         crc = crc32c_table[0][n];
0074         for (k = 1; k < 8; k++) {
0075             crc = crc32c_table[0][crc & 0xff] ^ (crc >> 8);
0076             crc32c_table[k][n] = crc;
0077         }
0078     }
0079 }
0080 
0081 /* Table-driven software version as a fall-back.  This is about 15 times slower
0082    than using the hardware instructions.  This assumes little-endian integers,
0083    as is the case on Intel processors that the assembler code here is for. */
0084 static uint32_t crc32c_sw(uint32_t crci, const unsigned char *buf, size_t len)
0085 {
0086     const unsigned char *next = buf;
0087     uint64_t crc;
0088 
0089     pthread_once(&crc32c_once_sw, crc32c_init_sw);
0090     crc = crci ^ 0xffffffff;
0091     while (len && ((const uintptr_t)next & 7) != 0) {
0092         crc = crc32c_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
0093         len--;
0094     }
0095     while (len >= 8) {
0096         crc ^= *(uint64_t *)next;
0097         crc = crc32c_table[7][crc & 0xff] ^
0098               crc32c_table[6][(crc >> 8) & 0xff] ^
0099               crc32c_table[5][(crc >> 16) & 0xff] ^
0100               crc32c_table[4][(crc >> 24) & 0xff] ^
0101               crc32c_table[3][(crc >> 32) & 0xff] ^
0102               crc32c_table[2][(crc >> 40) & 0xff] ^
0103               crc32c_table[1][(crc >> 48) & 0xff] ^
0104               crc32c_table[0][crc >> 56];
0105         next += 8;
0106         len -= 8;
0107     }
0108     while (len) {
0109         crc = crc32c_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
0110         len--;
0111     }
0112     return (uint32_t)crc ^ 0xffffffff;
0113 }
0114 
0115 #if defined(__x86_64__)
0116 /* Multiply a matrix times a vector over the Galois field of two elements,
0117    GF(2).  Each element is a bit in an unsigned integer.  mat must have at
0118    least as many entries as the power of two for most significant one bit in
0119    vec. */
0120 static inline uint32_t gf2_matrix_times(uint32_t *mat, uint32_t vec)
0121 {
0122     uint32_t sum;
0123 
0124     sum = 0;
0125     while (vec) {
0126         if (vec & 1)
0127             sum ^= *mat;
0128         vec >>= 1;
0129         mat++;
0130     }
0131     return sum;
0132 }
0133 
0134 /* Multiply a matrix by itself over GF(2).  Both mat and square must have 32
0135    rows. */
0136 static inline void gf2_matrix_square(uint32_t *square, uint32_t *mat)
0137 {
0138     int n;
0139 
0140     for (n = 0; n < 32; n++)
0141         square[n] = gf2_matrix_times(mat, mat[n]);
0142 }
0143 
0144 /* Construct an operator to apply len zeros to a crc.  len must be a power of
0145    two.  If len is not a power of two, then the result is the same as for the
0146    largest power of two less than len.  The result for len == 0 is the same as
0147    for len == 1.  A version of this routine could be easily written for any
0148    len, but that is not needed for this application. */
0149 static void crc32c_zeros_op(uint32_t *even, size_t len)
0150 {
0151     int n;
0152     uint32_t row;
0153     uint32_t odd[32];       /* odd-power-of-two zeros operator */
0154 
0155     /* put operator for one zero bit in odd */
0156     odd[0] = POLY;              /* CRC-32C polynomial */
0157     row = 1;
0158     for (n = 1; n < 32; n++) {
0159         odd[n] = row;
0160         row <<= 1;
0161     }
0162 
0163     /* put operator for two zero bits in even */
0164     gf2_matrix_square(even, odd);
0165 
0166     /* put operator for four zero bits in odd */
0167     gf2_matrix_square(odd, even);
0168 
0169     /* first square will put the operator for one zero byte (eight zero bits),
0170        in even -- next square puts operator for two zero bytes in odd, and so
0171        on, until len has been rotated down to zero */
0172     do {
0173         gf2_matrix_square(even, odd);
0174         len >>= 1;
0175         if (len == 0)
0176             return;
0177         gf2_matrix_square(odd, even);
0178         len >>= 1;
0179     } while (len);
0180 
0181     /* answer ended up in odd -- copy to even */
0182     for (n = 0; n < 32; n++)
0183         even[n] = odd[n];
0184 }
0185 
0186 /* Take a length and build four lookup tables for applying the zeros operator
0187    for that length, byte-by-byte on the operand. */
0188 static void crc32c_zeros(uint32_t zeros[][256], size_t len)
0189 {
0190     uint32_t n;
0191     uint32_t op[32];
0192 
0193     crc32c_zeros_op(op, len);
0194     for (n = 0; n < 256; n++) {
0195         zeros[0][n] = gf2_matrix_times(op, n);
0196         zeros[1][n] = gf2_matrix_times(op, n << 8);
0197         zeros[2][n] = gf2_matrix_times(op, n << 16);
0198         zeros[3][n] = gf2_matrix_times(op, n << 24);
0199     }
0200 }
0201 
0202 /* Apply the zeros operator table to crc. */
0203 static inline uint32_t crc32c_shift(uint32_t zeros[][256], uint32_t crc)
0204 {
0205     return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^
0206            zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24];
0207 }
0208 
0209 /* Block sizes for three-way parallel crc computation.  LONG and SHORT must
0210    both be powers of two.  The associated string constants must be set
0211    accordingly, for use in constructing the assembler instructions. */
0212 #define LONG 8192
0213 #define LONGx1 "8192"
0214 #define LONGx2 "16384"
0215 #define SHORT 256
0216 #define SHORTx1 "256"
0217 #define SHORTx2 "512"
0218 
0219 /* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */
0220 static pthread_once_t crc32c_once_hw = PTHREAD_ONCE_INIT;
0221 static uint32_t crc32c_long[4][256];
0222 static uint32_t crc32c_short[4][256];
0223 
0224 /* Initialize tables for shifting crcs. */
0225 static void crc32c_init_hw(void)
0226 {
0227     crc32c_zeros(crc32c_long, LONG);
0228     crc32c_zeros(crc32c_short, SHORT);
0229 }
0230 
0231 /* Compute CRC-32C using the Intel hardware instruction. */
0232 static uint32_t crc32c_hw(uint32_t crc, const unsigned char *buf, size_t len)
0233 {
0234     const unsigned char *next = buf;
0235     const unsigned char *end;
0236     uint64_t crc0, crc1, crc2;      /* need to be 64 bits for crc32q */
0237 
0238     /* populate shift tables the first time through */
0239     pthread_once(&crc32c_once_hw, crc32c_init_hw);
0240 
0241     /* pre-process the crc */
0242     crc0 = crc ^ 0xffffffff;
0243 
0244     /* compute the crc for up to seven leading bytes to bring the data pointer
0245        to an eight-byte boundary */
0246     while (len && ((const uintptr_t)next & 7) != 0) {
0247         __asm__("crc32b\t" "(%1), %0"
0248                 : "=r"(crc0)
0249                 : "r"(next), "0"(crc0));
0250         next++;
0251         len--;
0252     }
0253 
0254     /* compute the crc on sets of LONG*3 bytes, executing three independent crc
0255        instructions, each on LONG bytes -- this is optimized for the Nehalem,
0256        Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
0257        throughput of one crc per cycle, but a latency of three cycles */
0258     while (len >= LONG*3) {
0259         crc1 = 0;
0260         crc2 = 0;
0261         end = next + LONG;
0262         do {
0263             __asm__("crc32q\t" "(%3), %0\n\t"
0264                     "crc32q\t" LONGx1 "(%3), %1\n\t"
0265                     "crc32q\t" LONGx2 "(%3), %2"
0266                     : "=r"(crc0), "=r"(crc1), "=r"(crc2)
0267                     : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
0268             next += 8;
0269         } while (next < end);
0270         crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1;
0271         crc0 = crc32c_shift(crc32c_long, crc0) ^ crc2;
0272         next += LONG*2;
0273         len -= LONG*3;
0274     }
0275 
0276     /* do the same thing, but now on SHORT*3 blocks for the remaining data less
0277        than a LONG*3 block */
0278     while (len >= SHORT*3) {
0279         crc1 = 0;
0280         crc2 = 0;
0281         end = next + SHORT;
0282         do {
0283             __asm__("crc32q\t" "(%3), %0\n\t"
0284                     "crc32q\t" SHORTx1 "(%3), %1\n\t"
0285                     "crc32q\t" SHORTx2 "(%3), %2"
0286                     : "=r"(crc0), "=r"(crc1), "=r"(crc2)
0287                     : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
0288             next += 8;
0289         } while (next < end);
0290         crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1;
0291         crc0 = crc32c_shift(crc32c_short, crc0) ^ crc2;
0292         next += SHORT*2;
0293         len -= SHORT*3;
0294     }
0295 
0296     /* compute the crc on the remaining eight-byte units less than a SHORT*3
0297        block */
0298     end = next + (len - (len & 7));
0299     while (next < end) {
0300         __asm__("crc32q\t" "(%1), %0"
0301                 : "=r"(crc0)
0302                 : "r"(next), "0"(crc0));
0303         next += 8;
0304     }
0305     len &= 7;
0306 
0307     /* compute the crc for up to seven trailing bytes */
0308     while (len) {
0309         __asm__("crc32b\t" "(%1), %0"
0310                 : "=r"(crc0)
0311                 : "r"(next), "0"(crc0));
0312         next++;
0313         len--;
0314     }
0315 
0316     /* return a post-processed crc */
0317     return (uint32_t)crc0 ^ 0xffffffff;
0318 }
0319 
0320 /* Check for SSE 4.2.  SSE 4.2 was first supported in Nehalem processors
0321    introduced in November, 2008.  This does not check for the existence of the
0322    cpuid instruction itself, which was introduced on the 486SL in 1992, so this
0323    will fail on earlier x86 processors.  cpuid works on all Pentium and later
0324    processors. */
0325 #define SSE42(have) \
0326     do { \
0327         uint32_t eax, ecx; \
0328         eax = 1; \
0329         __asm__("cpuid" \
0330                 : "=c"(ecx) \
0331                 : "a"(eax) \
0332                 : "%ebx", "%edx"); \
0333         (have) = (ecx >> 20) & 1; \
0334     } while (0)
0335 
0336 #endif //defined(__x86_64__)
0337 
0338 /* Compute a CRC-32C.  If the crc32 instruction is available, use the hardware
0339    version.  Otherwise, use the software version. */
0340 uint32_t crc32c(uint32_t crc, const unsigned char *buf, size_t len)
0341 {
0342 #if defined(__x86_64__)
0343     int sse42;
0344 
0345     SSE42(sse42);
0346     return sse42 ? crc32c_hw(crc, buf, len) : crc32c_sw(crc, buf, len);
0347 #else
0348     return crc32c_sw(crc, buf, len);
0349 #endif
0350 }
0351 
0352 
0353 
0354 bool crc32c_hw_test()
0355 {
0356 
0357 #if defined(__x86_64__)
0358   int sse42;
0359 
0360   SSE42(sse42);
0361   return sse42;
0362 #else
0363   return 0;
0364 #endif
0365 }
0366