#include "pch.h"
#include "config.h"

#ifndef CRYPTOPP_IMPORTS
#ifndef CRYPTOPP_GENERATE_X64_MASM

#include "rijndael.h"
#include "stdcpp.h"
#include "misc.h"
#include "cpu.h"

NAMESPACE_BEGIN(CryptoPP)

#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS)
# define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1
#endif

#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
# if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
using namespace rdtable;
# else
static word64 Te[256];
# endif
static word64 Td[256];
#else
static word32 Te[256*4], Td[256*4];
#endif
static volatile bool s_TeFilled = false, s_TdFilled = false;
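
// Te/Td are the combined S-box + (Inv)MixColumns lookup tables, filled lazily
// by FillEncTable/FillDecTable below and gated by s_TeFilled/s_TdFilled. In
// the unaligned-access builds each word64 entry packs the 4-byte column twice,
// one byte apart, so a single 256-entry table serves all four byte positions
// via unaligned word32 loads; the portable build keeps four rotated word32
// tables instead. A sketch of how TL_M (defined below) reads one rotated
// column from a packed little-endian entry:
//
//   word32 Column(const word64 *T, unsigned int i, byte x)  // illustrative only
//   {
//       return *(const word32 *)((const byte *)T + x*8 + (i+3)%4 + 1);
//   }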

#define QUARTER_ROUND(L, T, t, a, b, c, d) \
    a ^= L(T, 3, byte(t)); t >>= 8;\
    b ^= L(T, 2, byte(t)); t >>= 8;\
    c ^= L(T, 1, byte(t)); t >>= 8;\
    d ^= L(T, 0, t);

#define QUARTER_ROUND_LE(t, a, b, c, d) \
    tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[d] = ((byte *)(Te+t))[1];

#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
#define QUARTER_ROUND_LD(t, a, b, c, d) \
    tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
    tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
    tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
    tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
#else
#define QUARTER_ROUND_LD(t, a, b, c, d) \
    tempBlock[a] = Sd[byte(t)]; t >>= 8;\
    tempBlock[b] = Sd[byte(t)]; t >>= 8;\
    tempBlock[c] = Sd[byte(t)]; t >>= 8;\
    tempBlock[d] = Sd[t];
#endif

#define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
#define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)

#ifdef IS_LITTLE_ENDIAN
#define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
#define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
#define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (6-i)%4+1))
#define TL_M(T, i, x) (*(word32 *)((byte *)T + x*8 + (i+3)%4+1))
#else
#define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
#define TL_M(T, i, x) T[i*256 + x]
#endif
#else
#define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
#define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
#define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (4-i)%4))
#define TL_M TL_F
#else
#define TL_F(T, i, x) rotrFixed(T[x], i*8)
#define TL_M(T, i, x) T[i*256 + x]
#endif
#endif

#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

#define f3(x) (f2(x) ^ x)
#define f9(x) (f8(x) ^ x)
#define fb(x) (f8(x) ^ f2(x) ^ x)
#define fd(x) (f8(x) ^ f4(x) ^ x)
#define fe(x) (f8(x) ^ f4(x) ^ f2(x))
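
// f2..fe multiply a byte by 2, 3, 4, 8, 9, 0xb, 0xd and 0xe in GF(2^8) modulo
// the AES polynomial x^8+x^4+x^3+x+1 (0x11b); the ((x>>7)&1)-style masks fold
// the overflow back in without a data-dependent branch. Two worked examples:
//
//   f2(0x80) == 0x100 ^ 0x11b == 0x1b   (the high bit wraps through 0x11b)
//   f3(0x57) == f2(0x57) ^ 0x57 == 0xae ^ 0x57 == 0xf9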

void Rijndael::Base::FillEncTable()
{
    for (int i=0; i<256; i++)
    {
        byte x = Se[i];
#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
        word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
        Te[i] = word64(y | f3(x))<<32 | y;
#else
        word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
        for (int j=0; j<4; j++)
        {
            Te[i+j*256] = y;
            y = rotrFixed(y, 8);
        }
#endif
    }
#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
    Te[256] = Te[257] = 0;
#endif
    s_TeFilled = true;
}
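
// Each entry encodes the MixColumns column (2x, x, x, 3x) of x = Se[i]. For
// example, i = 0 gives x = Se[0] = 0x63, f2(x) = 0xc6, f3(x) = 0xa5, so the
// portable build stores y = 0xc66363a5 in Te[0] and its byte rotations in
// Te[256], Te[512] and Te[768]. The two extra entries zeroed above exist only
// in the assembly build, which uses them as a block of guaranteed zeros.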

void Rijndael::Base::FillDecTable()
{
    for (int i=0; i<256; i++)
    {
        byte x = Sd[i];
#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
        word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
        Td[i] = word64(y | fb(x))<<32 | y | x;
#else
        word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
        for (int j=0; j<4; j++)
        {
            Td[i+j*256] = y;
            y = rotrFixed(y, 8);
        }
#endif
    }
    s_TdFilled = true;
}

void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
{
    AssertValidKeyLength(keylen);

    m_rounds = keylen/4 + 6;
    m_key.New(4*(m_rounds+1));

    word32 *rk = m_key;

#if (CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))

    if (HasAESNI())
    {
        static const word32 rcLE[] = {
            0x01, 0x02, 0x04, 0x08,
            0x10, 0x20, 0x40, 0x80,
            0x1B, 0x36,
        };
        const word32 *rc = rcLE;

        __m128i temp = _mm_loadu_si128((__m128i *)(userKey+keylen-16));
        memcpy(rk, userKey, keylen);

        while (true)
        {
            rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
            rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
            rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
            rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];

            if (rk + keylen/4 + 4 == m_key.end())
                break;

            if (keylen == 24)
            {
                rk[10] = rk[ 4] ^ rk[ 9];
                rk[11] = rk[ 5] ^ rk[10];
                temp = _mm_insert_epi32(temp, rk[11], 3);
            }
            else if (keylen == 32)
            {
                temp = _mm_insert_epi32(temp, rk[11], 3);
                rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
                rk[13] = rk[ 5] ^ rk[12];
                rk[14] = rk[ 6] ^ rk[13];
                rk[15] = rk[ 7] ^ rk[14];
                temp = _mm_insert_epi32(temp, rk[15], 3);
            }
            else
                temp = _mm_insert_epi32(temp, rk[7], 3);

            rk += keylen/4;
        }

        if (!IsForwardTransformation())
        {
            rk = m_key;
            unsigned int i, j;

            std::swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds));

            for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
            {
                temp = _mm_aesimc_si128(*(__m128i *)(rk+i));
                *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+j));
                *(__m128i *)(rk+j) = temp;
            }

            *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+i));
        }

        return;
    }
#endif
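
    // In the AES-NI path above, _mm_aeskeygenassist_si128 performs the key
    // schedule core in hardware: dword 3 of its result is SubWord(RotWord(w))
    // (the rcon is XORed in from rcLE separately), and dword 2 supplies the
    // plain SubWord step that 256-bit schedules need. The loop below computes
    // the same core in scalar code; a hypothetical helper showing just that
    // step:
    //
    //   word32 SubRotWord(word32 w)  // illustrative only
    //   {
    //       return (word32(Se[GETBYTE(w, 2)]) << 24) ^ (word32(Se[GETBYTE(w, 1)]) << 16)
    //            ^ (word32(Se[GETBYTE(w, 0)]) <<  8) ^  word32(Se[GETBYTE(w, 3)]);
    //   }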

    GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
    const word32 *rc = rcon;
    word32 temp;

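    // Table-free key expansion per FIPS 197: each pass derives the next Nk
    // words, applying SubWord(RotWord(.)) ^ Rcon to the first word of the
    // group, plus a plain SubWord on the fourth word for 256-bit keys.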
    while (true)
    {
        temp = rk[keylen/4-1];
        word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
        rk[keylen/4] = rk[0] ^ x ^ *(rc++);
        rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
        rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
        rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];

        if (rk + keylen/4 + 4 == m_key.end())
            break;

        if (keylen == 24)
        {
            rk[10] = rk[ 4] ^ rk[ 9];
            rk[11] = rk[ 5] ^ rk[10];
        }
        else if (keylen == 32)
        {
            temp = rk[11];
            rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
            rk[13] = rk[ 5] ^ rk[12];
            rk[14] = rk[ 6] ^ rk[13];
            rk[15] = rk[ 7] ^ rk[14];
        }
        rk += keylen/4;
    }

    rk = m_key;

    if (IsForwardTransformation())
    {
        if (!s_TeFilled)
            FillEncTable();

        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
    }
    else
    {
        if (!s_TdFilled)
            FillDecTable();

        unsigned int i, j;

#define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])

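        // InverseMixColumn applies InvMixColumns to a round-key word:
        // indexing Td (which is built from Sd) by Se[b] cancels the
        // InvSubBytes baked into the table, leaving pure InvMixColumns.
        // Transforming the middle round keys this way yields the equivalent
        // decryption key schedule of FIPS 197 Sec. 5.3.5, so decryption can
        // reuse the encryption round structure.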
        for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
        {
            temp = InverseMixColumn(rk[i    ]); rk[i    ] = InverseMixColumn(rk[j    ]); rk[j    ] = temp;
            temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
            temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
            temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
        }

        rk[i+0] = InverseMixColumn(rk[i+0]);
        rk[i+1] = InverseMixColumn(rk[i+1]);
        rk[i+2] = InverseMixColumn(rk[i+2]);
        rk[i+3] = InverseMixColumn(rk[i+3]);

        temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
        temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
        temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
        temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
    }

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
    if (HasAESNI())
        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
#endif
}

void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
    if (HasSSE2())
#else
    if (HasAESNI())
#endif
    {
        return (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
    }
#endif

    typedef BlockGetAndPut<word32, NativeByteOrder> Block;

    word32 s0, s1, s2, s3, t0, t1, t2, t3;
    Block::Get(inBlock)(s0)(s1)(s2)(s3);

    const word32 *rk = m_key;
    s0 ^= rk[0];
    s1 ^= rk[1];
    s2 ^= rk[2];
    s3 ^= rk[3];
    t0 = rk[4];
    t1 = rk[5];
    t2 = rk[6];
    t3 = rk[7];
    rk += 8;

    const int cacheLineSize = GetCacheLineSize();
    unsigned int i;
    word32 u = 0;
#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
    for (i=0; i<2048; i+=cacheLineSize)
#else
    for (i=0; i<1024; i+=cacheLineSize)
#endif
        u &= *(const word32 *)(((const byte *)Te)+i);
    u &= Te[255];
    s0 |= u; s1 |= u; s2 |= u; s3 |= u;
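
    // Timing-attack countermeasure: the loop above reads one word from every
    // cache line of Te (plus its last entry) before any key-dependent lookup,
    // pulling the whole table into cache. u provably stays zero, so the |=
    // lines cannot change s0..s3; they only keep the preloading reads alive.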

    QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
    QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
    QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
    QUARTER_ROUND_FE(s0, t1, t2, t3, t0)

    unsigned int r = m_rounds/2 - 1;
    do
    {
        s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

        QUARTER_ROUND_E(t3, s0, s1, s2, s3)
        QUARTER_ROUND_E(t2, s3, s0, s1, s2)
        QUARTER_ROUND_E(t1, s2, s3, s0, s1)
        QUARTER_ROUND_E(t0, s1, s2, s3, s0)

        t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

        QUARTER_ROUND_E(s3, t0, t1, t2, t3)
        QUARTER_ROUND_E(s2, t3, t0, t1, t2)
        QUARTER_ROUND_E(s1, t2, t3, t0, t1)
        QUARTER_ROUND_E(s0, t1, t2, t3, t0)

        rk += 8;
    } while (--r);

    word32 tbw[4];
    byte *const tempBlock = (byte *)tbw;

    QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
    QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
    QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
    QUARTER_ROUND_LE(t3, 3, 6, 9, 12)

    Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}
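
// A minimal usage sketch (assuming the usual Crypto++ wrappers from aes.h and
// modes.h; shown for orientation only):
//
//   CryptoPP::ECB_Mode<CryptoPP::AES>::Encryption enc(key, 16);
//   enc.ProcessData(out, in, 16);  // reaches ProcessAndXorBlock above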

void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
    if (HasAESNI())
    {
        Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
        return;
    }
#endif

    typedef BlockGetAndPut<word32, NativeByteOrder> Block;

    word32 s0, s1, s2, s3, t0, t1, t2, t3;
    Block::Get(inBlock)(s0)(s1)(s2)(s3);

    const word32 *rk = m_key;
    s0 ^= rk[0];
    s1 ^= rk[1];
    s2 ^= rk[2];
    s3 ^= rk[3];
    t0 = rk[4];
    t1 = rk[5];
    t2 = rk[6];
    t3 = rk[7];
    rk += 8;

    const int cacheLineSize = GetCacheLineSize();
    unsigned int i;
    word32 u = 0;
#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
    for (i=0; i<2048; i+=cacheLineSize)
#else
    for (i=0; i<1024; i+=cacheLineSize)
#endif
        u &= *(const word32 *)(((const byte *)Td)+i);
    u &= Td[255];
    s0 |= u; s1 |= u; s2 |= u; s3 |= u;

    QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
    QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
    QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
    QUARTER_ROUND_FD(s0, t3, t2, t1, t0)

    unsigned int r = m_rounds/2 - 1;
    do
    {
        s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

        QUARTER_ROUND_D(t3, s2, s1, s0, s3)
        QUARTER_ROUND_D(t2, s1, s0, s3, s2)
        QUARTER_ROUND_D(t1, s0, s3, s2, s1)
        QUARTER_ROUND_D(t0, s3, s2, s1, s0)

        t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

        QUARTER_ROUND_D(s3, t2, t1, t0, t3)
        QUARTER_ROUND_D(s2, t1, t0, t3, t2)
        QUARTER_ROUND_D(s1, t0, t3, t2, t1)
        QUARTER_ROUND_D(s0, t3, t2, t1, t0)

        rk += 8;
    } while (--r);

#if !(defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS))

    u = 0;
    for (i=0; i<256; i+=cacheLineSize)
        u &= *(const word32 *)(Sd+i);
    u &= *(const word32 *)(Sd+252);
    t0 |= u; t1 |= u; t2 |= u; t3 |= u;
#endif
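
    // Same countermeasure for the plain S-box: the aligned build's final
    // round indexes Sd[] directly, so every cache line of it is touched first.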

    word32 tbw[4];
    byte *const tempBlock = (byte *)tbw;

    QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
    QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
    QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
    QUARTER_ROUND_LD(t3, 11, 6, 1, 12)

    Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}

#if CRYPTOPP_MSC_VERSION
# pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
#endif

#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)

CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
{
    CRYPTOPP_UNUSED(locals); CRYPTOPP_UNUSED(k);

#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32

#define L_REG esp
#define L_INDEX(i) (L_REG+768+i)
#define L_INXORBLOCKS L_INBLOCKS+4
#define L_OUTXORBLOCKS L_INBLOCKS+8
#define L_OUTBLOCKS L_INBLOCKS+12
#define L_INCREMENTS L_INDEX(16*15)
#define L_SP L_INDEX(16*16)
#define L_LENGTH L_INDEX(16*16+4)
#define L_KEYS_BEGIN L_INDEX(16*16+8)

#define MOVD movd
#define MM(i) mm##i

#define MXOR(a,b,c) \
    AS2( movzx esi, b)\
    AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
    AS2( pxor MM(a), mm7)\

#define MMOV(a,b,c) \
    AS2( movzx esi, b)\
    AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#else

#define L_REG r8
#define L_INDEX(i) (L_REG+i)
#define L_INXORBLOCKS L_INBLOCKS+8
#define L_OUTXORBLOCKS L_INBLOCKS+16
#define L_OUTBLOCKS L_INBLOCKS+24
#define L_INCREMENTS L_INDEX(16*16)
#define L_LENGTH L_INDEX(16*18+8)
#define L_KEYS_BEGIN L_INDEX(16*19)

#define MOVD mov
#define MM_0 r9d
#define MM_1 r12d
#ifdef __GNUC__
#define MM_2 r11d
#else
#define MM_2 r10d
#endif
#define MM(i) MM_##i

#define MXOR(a,b,c) \
    AS2( movzx esi, b)\
    AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define MMOV(a,b,c) \
    AS2( movzx esi, b)\
    AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#endif

#define L_SUBKEYS L_INDEX(0)
#define L_SAVED_X L_SUBKEYS
#define L_KEY12 L_INDEX(16*12)
#define L_LASTROUND L_INDEX(16*13)
#define L_INBLOCKS L_INDEX(16*14)
#define MAP0TO4(i) (ASM_MOD(i+3,4)+1)
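
// MAP0TO4 maps a byte index 0..3 into offsets 1..4 of each padded 8-byte
// table entry, matching the little-endian TL_M layout above:
// MAP0TO4(i) == ((i+3) mod 4) + 1.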

#define XOR(a,b,c) \
    AS2( movzx esi, b)\
    AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define MOV(a,b,c) \
    AS2( movzx esi, b)\
    AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#ifdef CRYPTOPP_GENERATE_X64_MASM
    ALIGN 8
Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
    rex_push_reg rsi
    push_reg rdi
    push_reg rbx
    push_reg r12
    .endprolog
    mov L_REG, rcx
    mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
    mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
#elif defined(__GNUC__)
    __asm__ __volatile__
    (
    INTEL_NOPREFIX
#if CRYPTOPP_BOOL_X64
    AS2( mov L_REG, rcx)
#endif
    AS_PUSH_IF86(bx)
    AS_PUSH_IF86(bp)
    AS2( mov AS_REG_7, WORD_REG(si))
#else
    AS_PUSH_IF86(si)
    AS_PUSH_IF86(di)
    AS_PUSH_IF86(bx)
    AS_PUSH_IF86(bp)
    AS2( lea AS_REG_7, [Te])
    AS2( mov edi, [g_cacheLineSize])
#endif

#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
    AS2( mov [ecx+16*12+16*4], esp)
    AS2( lea esp, [ecx-768])
#endif

    AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
    AS2( mov WORD_REG(ax), 16)
    AS2( and WORD_REG(ax), WORD_REG(si))
    AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)])
    AS2( movdqa [L_KEY12], xmm3)
    AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
    AS2( sub WORD_REG(ax), WORD_REG(si))
    ASL(0)
    AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
    AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
    AS2( add WORD_REG(si), 16)
    AS2( cmp WORD_REG(si), 16*12)
    ASJ( jl, 0, b)

    AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)])
    AS2( movdqa xmm1, [WORD_REG(dx)])
    AS2( MOVD MM(1), [WORD_REG(dx)+4*4])
    AS2( mov ebx, [WORD_REG(dx)+5*4])
    AS2( mov ecx, [WORD_REG(dx)+6*4])
    AS2( mov edx, [WORD_REG(dx)+7*4])

    AS2( xor WORD_REG(ax), WORD_REG(ax))
    ASL(9)
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( cmp WORD_REG(ax), 2048)
    ASJ( jl, 9, b)
    AS1( lfence)

    AS2( test DWORD PTR [L_LENGTH], 1)
    ASJ( jz, 8, f)

    AS2( mov WORD_REG(si), [L_INBLOCKS])
    AS2( movdqu xmm2, [WORD_REG(si)])
    AS2( pxor xmm2, xmm1)
    AS2( psrldq xmm1, 14)
    AS2( movd eax, xmm1)
    AS2( mov al, BYTE PTR [WORD_REG(si)+15])
    AS2( MOVD MM(2), eax)
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
    AS2( mov eax, 1)
    AS2( movd mm3, eax)
#endif

    AS2( movd eax, xmm2)
    AS2( psrldq xmm2, 4)
    AS2( movd edi, xmm2)
    AS2( psrldq xmm2, 4)
    MXOR( 1, al, 0)
    XOR( edx, ah, 1)
    AS2( shr eax, 16)
    XOR( ecx, al, 2)
    XOR( ebx, ah, 3)
    AS2( mov eax, edi)
    AS2( movd edi, xmm2)
    AS2( psrldq xmm2, 4)
    XOR( ebx, al, 0)
    MXOR( 1, ah, 1)
    AS2( shr eax, 16)
    XOR( edx, al, 2)
    XOR( ecx, ah, 3)
    AS2( mov eax, edi)
    AS2( movd edi, xmm2)
    XOR( ecx, al, 0)
    XOR( ebx, ah, 1)
    AS2( shr eax, 16)
    MXOR( 1, al, 2)
    XOR( edx, ah, 3)
    AS2( mov eax, edi)
    XOR( edx, al, 0)
    XOR( ecx, ah, 1)
    AS2( shr eax, 16)
    XOR( ebx, al, 2)
    AS2( psrldq xmm2, 3)

    AS2( mov eax, [L_KEY12+0*4])
    AS2( mov edi, [L_KEY12+2*4])
    AS2( MOVD MM(0), [L_KEY12+3*4])
    MXOR( 0, cl, 3)
    XOR( edi, bl, 3)
    MXOR( 0, bh, 2)
    AS2( shr ebx, 16)
    XOR( eax, bl, 1)
    MOV( ebx, bh, 0)
    AS2( xor ebx, [L_KEY12+1*4])
    XOR( eax, ch, 2)
    AS2( shr ecx, 16)
    XOR( eax, dl, 3)
    XOR( ebx, dh, 2)
    AS2( shr edx, 16)
    XOR( edi, ch, 0)
    XOR( ebx, cl, 1)
    XOR( edi, dl, 1)
    MXOR( 0, dh, 0)

    AS2( movd ecx, xmm2)
    AS2( MOVD edx, MM(1))
    AS2( MOVD [L_SAVED_X+3*4], MM(0))
    AS2( mov [L_SAVED_X+0*4], eax)
    AS2( mov [L_SAVED_X+1*4], ebx)
    AS2( mov [L_SAVED_X+2*4], edi)
    ASJ( jmp, 5, f)

    ASL(3)

    AS2( MOVD MM(1), [L_KEY12+0*4])
    AS2( mov ebx, [L_KEY12+1*4])
    AS2( mov ecx, [L_KEY12+2*4])
    AS2( mov edx, [L_KEY12+3*4])
    ASL(8)
    AS2( mov WORD_REG(ax), [L_INBLOCKS])
    AS2( movdqu xmm2, [WORD_REG(ax)])
    AS2( mov WORD_REG(si), [L_INXORBLOCKS])
    AS2( movdqu xmm5, [WORD_REG(si)])
    AS2( pxor xmm2, xmm1)
    AS2( pxor xmm2, xmm5)

    AS2( movd eax, xmm2)
    AS2( psrldq xmm2, 4)
    AS2( movd edi, xmm2)
    AS2( psrldq xmm2, 4)
    MXOR( 1, al, 0)
    XOR( edx, ah, 1)
    AS2( shr eax, 16)
    XOR( ecx, al, 2)
    XOR( ebx, ah, 3)
    AS2( mov eax, edi)
    AS2( movd edi, xmm2)
    AS2( psrldq xmm2, 4)
    XOR( ebx, al, 0)
    MXOR( 1, ah, 1)
    AS2( shr eax, 16)
    XOR( edx, al, 2)
    XOR( ecx, ah, 3)
    AS2( mov eax, edi)
    AS2( movd edi, xmm2)
    XOR( ecx, al, 0)
    XOR( ebx, ah, 1)
    AS2( shr eax, 16)
    MXOR( 1, al, 2)
    XOR( edx, ah, 3)
    AS2( mov eax, edi)
    XOR( edx, al, 0)
    XOR( ecx, ah, 1)
    AS2( shr eax, 16)
    XOR( ebx, al, 2)
    MXOR( 1, ah, 3)
    AS2( MOVD eax, MM(1))

    AS2( add L_REG, [L_KEYS_BEGIN])
    AS2( add L_REG, 4*16)
    ASJ( jmp, 2, f)

    ASL(1)

    AS2( MOVD ecx, MM(2))
    AS2( MOVD edx, MM(1))
    AS2( mov eax, [L_SAVED_X+0*4])
    AS2( mov ebx, [L_SAVED_X+1*4])
    AS2( xor cl, ch)
    AS2( and WORD_REG(cx), 255)
    ASL(5)
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
    AS2( paddb MM(2), mm3)
#else
    AS2( add MM(2), 1)
#endif

    AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
    XOR( ebx, dl, 3)
    MOV( ecx, dh, 2)
    AS2( shr edx, 16)
    AS2( xor ecx, [L_SAVED_X+2*4])
    XOR( eax, dh, 0)
    MOV( edx, dl, 1)
    AS2( xor edx, [L_SAVED_X+3*4])

    AS2( add L_REG, [L_KEYS_BEGIN])
    AS2( add L_REG, 3*16)
    ASJ( jmp, 4, f)

#define ROUND() \
    MXOR( 0, cl, 3) \
    AS2( mov cl, al) \
    XOR( edi, ah, 2) \
    AS2( shr eax, 16) \
    XOR( edi, bl, 3) \
    MXOR( 0, bh, 2) \
    AS2( shr ebx, 16) \
    MXOR( 0, al, 1) \
    MOV( eax, ah, 0) \
    XOR( eax, bl, 1) \
    MOV( ebx, bh, 0) \
    XOR( eax, ch, 2) \
    XOR( ebx, cl, 3) \
    AS2( shr ecx, 16) \
    XOR( eax, dl, 3) \
    XOR( ebx, dh, 2) \
    AS2( shr edx, 16) \
    XOR( edi, ch, 0) \
    XOR( ebx, cl, 1) \
    XOR( edi, dl, 1) \
    MXOR( 0, dh, 0) \

    ASL(2)
    AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
    AS2( mov edi, [L_SUBKEYS-4*16+2*4])
    ROUND()
    AS2( mov ecx, edi)
    AS2( xor eax, [L_SUBKEYS-4*16+0*4])
    AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
    AS2( MOVD edx, MM(0))

    ASL(4)
    AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
    AS2( mov edi, [L_SUBKEYS-4*16+6*4])
    ROUND()
    AS2( mov ecx, edi)
    AS2( xor eax, [L_SUBKEYS-4*16+4*4])
    AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
    AS2( MOVD edx, MM(0))

    AS2( add L_REG, 32)
    AS2( test L_REG, 255)
    ASJ( jnz, 2, b)
    AS2( sub L_REG, 16*16)

#define LAST(a, b, c) \
    AS2( movzx esi, a )\
    AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
    AS2( movzx esi, b )\
    AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
    AS2( mov WORD PTR [L_LASTROUND+c], di )\

    LAST(ch, dl, 2)
    LAST(dh, al, 6)
    AS2( shr edx, 16)
    LAST(ah, bl, 10)
    AS2( shr eax, 16)
    LAST(bh, cl, 14)
    AS2( shr ebx, 16)
    LAST(dh, al, 12)
    AS2( shr ecx, 16)
    LAST(ah, bl, 0)
    LAST(bh, cl, 4)
    LAST(ch, dl, 8)

    AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
    AS2( mov WORD_REG(bx), [L_OUTBLOCKS])

    AS2( mov WORD_REG(cx), [L_LENGTH])
    AS2( sub WORD_REG(cx), 16)

    AS2( movdqu xmm2, [WORD_REG(ax)])
    AS2( pxor xmm2, xmm4)

#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
    AS2( movdqa xmm0, [L_INCREMENTS])
    AS2( paddd xmm0, [L_INBLOCKS])
    AS2( movdqa [L_INBLOCKS], xmm0)
#else
    AS2( movdqa xmm0, [L_INCREMENTS+16])
    AS2( paddq xmm0, [L_INBLOCKS+16])
    AS2( movdqa [L_INBLOCKS+16], xmm0)
#endif

    AS2( pxor xmm2, [L_LASTROUND])
    AS2( movdqu [WORD_REG(bx)], xmm2)

    ASJ( jle, 7, f)
    AS2( mov [L_LENGTH], WORD_REG(cx))
    AS2( test WORD_REG(cx), 1)
    ASJ( jnz, 1, b)
#if CRYPTOPP_BOOL_X64
    AS2( movdqa xmm0, [L_INCREMENTS])
    AS2( paddq xmm0, [L_INBLOCKS])
    AS2( movdqa [L_INBLOCKS], xmm0)
#endif
    ASJ( jmp, 3, b)

    ASL(7)

    AS2( xorps xmm0, xmm0)
    AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
    AS2( movaps [WORD_REG(ax)-7*16], xmm0)
    AS2( movaps [WORD_REG(ax)-6*16], xmm0)
    AS2( movaps [WORD_REG(ax)-5*16], xmm0)
    AS2( movaps [WORD_REG(ax)-4*16], xmm0)
    AS2( movaps [WORD_REG(ax)-3*16], xmm0)
    AS2( movaps [WORD_REG(ax)-2*16], xmm0)
    AS2( movaps [WORD_REG(ax)-1*16], xmm0)
    AS2( movaps [WORD_REG(ax)+0*16], xmm0)
    AS2( movaps [WORD_REG(ax)+1*16], xmm0)
    AS2( movaps [WORD_REG(ax)+2*16], xmm0)
    AS2( movaps [WORD_REG(ax)+3*16], xmm0)
    AS2( movaps [WORD_REG(ax)+4*16], xmm0)
    AS2( movaps [WORD_REG(ax)+5*16], xmm0)
    AS2( movaps [WORD_REG(ax)+6*16], xmm0)
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
    AS2( mov esp, [L_SP])
    AS1( emms)
#endif
    AS_POP_IF86(bp)
    AS_POP_IF86(bx)
#if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
    AS_POP_IF86(di)
    AS_POP_IF86(si)
    AS1(ret)
#endif
#ifdef CRYPTOPP_GENERATE_X64_MASM
    pop r12
    pop rbx
    pop rdi
    pop rsi
    ret
Rijndael_Enc_AdvancedProcessBlocks ENDP
#endif
#ifdef __GNUC__
    ATT_PREFIX
        :
        : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
        : "memory", "cc", "%eax"
    #if CRYPTOPP_BOOL_X64
        , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
    #endif
    );
#endif
}

#endif

#ifndef CRYPTOPP_GENERATE_X64_MASM

#ifdef CRYPTOPP_X64_MASM_AVAILABLE
extern "C" {
void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);
}
#endif

#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86

static inline bool AliasedWithTable(const byte *begin, const byte *end)
{
    size_t s0 = size_t(begin)%4096, s1 = size_t(end)%4096;
    size_t t0 = size_t(Te)%4096, t1 = (size_t(Te)+sizeof(Te))%4096;
    if (t1 > t0)
        return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
    else
        return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
}
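
// True when [begin, end) can overlap Te modulo 4096, i.e. when the stack
// locals would share page offsets (and hence cache sets) with the table;
// the else branch handles a table interval that wraps past a page boundary.
// The caller below simply re-allocates until this returns false.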

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

inline void AESNI_Enc_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
{
    block = _mm_xor_si128(block, subkeys[0]);
    for (unsigned int i=1; i<rounds-1; i+=2)
    {
        block = _mm_aesenc_si128(block, subkeys[i]);
        block = _mm_aesenc_si128(block, subkeys[i+1]);
    }
    block = _mm_aesenc_si128(block, subkeys[rounds-1]);
    block = _mm_aesenclast_si128(block, subkeys[rounds]);
}
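
// subkeys[0..rounds] holds the rounds+1 round keys as __m128i. The loop is
// unrolled two rounds per iteration; the last full round is peeled so that
// _mm_aesenclast_si128 (no MixColumns) consumes subkeys[rounds].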

inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
{
    __m128i rk = subkeys[0];
    block0 = _mm_xor_si128(block0, rk);
    block1 = _mm_xor_si128(block1, rk);
    block2 = _mm_xor_si128(block2, rk);
    block3 = _mm_xor_si128(block3, rk);
    for (unsigned int i=1; i<rounds; i++)
    {
        rk = subkeys[i];
        block0 = _mm_aesenc_si128(block0, rk);
        block1 = _mm_aesenc_si128(block1, rk);
        block2 = _mm_aesenc_si128(block2, rk);
        block3 = _mm_aesenc_si128(block3, rk);
    }
    rk = subkeys[rounds];
    block0 = _mm_aesenclast_si128(block0, rk);
    block1 = _mm_aesenclast_si128(block1, rk);
    block2 = _mm_aesenclast_si128(block2, rk);
    block3 = _mm_aesenclast_si128(block3, rk);
}

inline void AESNI_Dec_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
{
    block = _mm_xor_si128(block, subkeys[0]);
    for (unsigned int i=1; i<rounds-1; i+=2)
    {
        block = _mm_aesdec_si128(block, subkeys[i]);
        block = _mm_aesdec_si128(block, subkeys[i+1]);
    }
    block = _mm_aesdec_si128(block, subkeys[rounds-1]);
    block = _mm_aesdeclast_si128(block, subkeys[rounds]);
}

inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
{
    __m128i rk = subkeys[0];
    block0 = _mm_xor_si128(block0, rk);
    block1 = _mm_xor_si128(block1, rk);
    block2 = _mm_xor_si128(block2, rk);
    block3 = _mm_xor_si128(block3, rk);
    for (unsigned int i=1; i<rounds; i++)
    {
        rk = subkeys[i];
        block0 = _mm_aesdec_si128(block0, rk);
        block1 = _mm_aesdec_si128(block1, rk);
        block2 = _mm_aesdec_si128(block2, rk);
        block3 = _mm_aesdec_si128(block3, rk);
    }
    rk = subkeys[rounds];
    block0 = _mm_aesdeclast_si128(block0, rk);
    block1 = _mm_aesdeclast_si128(block1, rk);
    block2 = _mm_aesdeclast_si128(block2, rk);
    block3 = _mm_aesdeclast_si128(block3, rk);
}

static CRYPTOPP_ALIGN_DATA(16) const word32 s_one[] = {0, 0, 0, 1<<24};
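
// s_one adds 1<<24 to the last 32-bit lane, i.e. +1 on byte 15 of the block:
// a big-endian counter increment, valid as long as the low 32 counter bits
// do not carry out. The 4-block path below uses it to derive consecutive
// counter blocks from block0.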

template <typename F1, typename F4>
inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, const __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    size_t blockSize = 16;
    size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = xorBlocks ? blockSize : 0;
    size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;

    if (flags & BlockTransformation::BT_ReverseDirection)
    {
        assert(length % blockSize == 0);
        inBlocks += length - blockSize;
        xorBlocks += length - blockSize;
        outBlocks += length - blockSize;
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BlockTransformation::BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            __m128i block0 = _mm_loadu_si128((const __m128i *)inBlocks), block1, block2, block3;
            if (flags & BlockTransformation::BT_InBlockIsCounter)
            {
                const __m128i be1 = *(const __m128i *)s_one;
                block1 = _mm_add_epi32(block0, be1);
                block2 = _mm_add_epi32(block1, be1);
                block3 = _mm_add_epi32(block2, be1);
                _mm_storeu_si128((__m128i *)inBlocks, _mm_add_epi32(block3, be1));
            }
            else
            {
                inBlocks += inIncrement;
                block1 = _mm_loadu_si128((const __m128i *)inBlocks);
                inBlocks += inIncrement;
                block2 = _mm_loadu_si128((const __m128i *)inBlocks);
                inBlocks += inIncrement;
                block3 = _mm_loadu_si128((const __m128i *)inBlocks);
                inBlocks += inIncrement;
            }

            if (flags & BlockTransformation::BT_XorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
            }

            func4(block0, block1, block2, block3, subkeys, rounds);

            if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
            }

            _mm_storeu_si128((__m128i *)outBlocks, block0);
            outBlocks += outIncrement;
            _mm_storeu_si128((__m128i *)outBlocks, block1);
            outBlocks += outIncrement;
            _mm_storeu_si128((__m128i *)outBlocks, block2);
            outBlocks += outIncrement;
            _mm_storeu_si128((__m128i *)outBlocks, block3);
            outBlocks += outIncrement;

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        __m128i block = _mm_loadu_si128((const __m128i *)inBlocks);

        if (flags & BlockTransformation::BT_XorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));

        if (flags & BlockTransformation::BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subkeys, rounds);

        if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
            block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));

        _mm_storeu_si128((__m128i *)outBlocks, block);

        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
    }

    return length;
}
#endif
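
// Each AdvancedProcessBlocks below consumes as many whole 16-byte blocks as
// possible and returns the number of unprocessed trailing bytes.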

size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
    if (HasAESNI())
        return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif

#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
    if (HasSSE2())
    {
        if (length < BLOCKSIZE)
            return length;

        struct Locals
        {
            word32 subkeys[4*12], workspace[8];
            const byte *inBlocks, *inXorBlocks, *outXorBlocks;
            byte *outBlocks;
            size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
            size_t regSpill, lengthAndCounterFlag, keysBegin;
        };

        size_t increment = BLOCKSIZE;
        const byte* zeros = (byte *)(Te+256);
        byte *space;

        do {
#if (CRYPTOPP_MSC_VERSION >= 1400)
            space = (byte *)_malloca(255+sizeof(Locals));
            space += (256-(size_t)space%256)%256;
#else
            space = (byte *)alloca(255+sizeof(Locals));
            space += (256-(size_t)space%256)%256;
#endif
        }
        while (AliasedWithTable(space, space+sizeof(Locals)));

        if (flags & BT_ReverseDirection)
        {
            assert(length % BLOCKSIZE == 0);
            inBlocks += length - BLOCKSIZE;
            xorBlocks += length - BLOCKSIZE;
            outBlocks += length - BLOCKSIZE;
            increment = 0-increment;
        }

        Locals &locals = *(Locals *)space;

        locals.inBlocks = inBlocks;
        locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
        locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
        locals.outBlocks = outBlocks;

        locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
        locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
        locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
        locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;

        locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
        int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
        locals.keysBegin = (12-keysToCopy)*16;

        Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
        return length % BLOCKSIZE;
    }
#endif

    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}

#endif

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
    if (HasAESNI())
        return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);

    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}

#endif // #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

NAMESPACE_END

#endif
#endif