00001
00002
00003
00004
00005 #include "pch.h"
00006 #include "config.h"
00007
00008 #ifndef CRYPTOPP_GENERATE_X64_MASM
00009
00010 #include "salsa.h"
00011 #include "argnames.h"
00012 #include "misc.h"
00013 #include "cpu.h"
00014
00015 #if CRYPTOPP_MSC_VERSION
00016 # pragma warning(disable: 4702 4740)
00017 #endif
00018
00019
00020
00021
00022
00023 #if defined(CRYPTOPP_DISABLE_SALSA_ASM)
00024 # undef CRYPTOPP_X86_ASM_AVAILABLE
00025 # undef CRYPTOPP_X32_ASM_AVAILABLE
00026 # undef CRYPTOPP_X64_ASM_AVAILABLE
00027 # undef CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00028 # undef CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
00029 # define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 0
00030 # define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 0
00031 #endif
00032
00033 NAMESPACE_BEGIN(CryptoPP)
00034
00035 #if !defined(NDEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING)
00036 void Salsa20_TestInstantiations()
00037 {
00038 Salsa20::Encryption x;
00039 }
00040 #endif
00041
00042 void Salsa20_Policy::CipherSetKey(const NameValuePairs ¶ms, const byte *key, size_t length)
00043 {
00044 m_rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);
00045
00046 if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
00047 throw InvalidRounds(Salsa20::StaticAlgorithmName(), m_rounds);
00048
00049
00050 GetBlock<word32, LittleEndian> get1(key);
00051 get1(m_state[13])(m_state[10])(m_state[7])(m_state[4]);
00052 GetBlock<word32, LittleEndian> get2(key + length - 16);
00053 get2(m_state[15])(m_state[12])(m_state[9])(m_state[6]);
00054
00055
00056 m_state[0] = 0x61707865;
00057 m_state[1] = (length == 16) ? 0x3120646e : 0x3320646e;
00058 m_state[2] = (length == 16) ? 0x79622d36 : 0x79622d32;
00059 m_state[3] = 0x6b206574;
00060 }
00061
00062 void Salsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
00063 {
00064 CRYPTOPP_UNUSED(keystreamBuffer), CRYPTOPP_UNUSED(length);
00065 assert(length==8);
00066
00067 GetBlock<word32, LittleEndian> get(IV);
00068 get(m_state[14])(m_state[11]);
00069 m_state[8] = m_state[5] = 0;
00070 }
00071
00072 void Salsa20_Policy::SeekToIteration(lword iterationCount)
00073 {
00074 m_state[8] = (word32)iterationCount;
00075 m_state[5] = (word32)SafeRightShift<32>(iterationCount);
00076 }
00077
00078 #if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64) && !defined(CRYPTOPP_DISABLE_SALSA_ASM)
00079 unsigned int Salsa20_Policy::GetAlignment() const
00080 {
00081 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00082 if (HasSSE2())
00083 return 16;
00084 else
00085 #endif
00086 return GetAlignmentOf<word32>();
00087 }
00088
00089 unsigned int Salsa20_Policy::GetOptimalBlockSize() const
00090 {
00091 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00092 if (HasSSE2())
00093 return 4*BYTES_PER_ITERATION;
00094 else
00095 #endif
00096 return BYTES_PER_ITERATION;
00097 }
00098 #endif
00099
00100 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
00101 extern "C" {
00102 void Salsa20_OperateKeystream(byte *output, const byte *input, size_t iterationCount, int rounds, void *state);
00103 }
00104 #endif
00105
00106 #if CRYPTOPP_MSC_VERSION
00107 # pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
00108 #endif
00109
00110 void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
00111 {
00112 #endif // #ifdef CRYPTOPP_GENERATE_X64_MASM
00113
00114 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
00115 Salsa20_OperateKeystream(output, input, iterationCount, m_rounds, m_state.data());
00116 return;
00117 #endif
00118
00119 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00120 #ifdef CRYPTOPP_GENERATE_X64_MASM
00121 ALIGN 8
00122 Salsa20_OperateKeystream PROC FRAME
00123 mov r10, [rsp + 5*8] ; state
00124 alloc_stack(10*16 + 32*16 + 8)
00125 save_xmm128 xmm6, 0200h
00126 save_xmm128 xmm7, 0210h
00127 save_xmm128 xmm8, 0220h
00128 save_xmm128 xmm9, 0230h
00129 save_xmm128 xmm10, 0240h
00130 save_xmm128 xmm11, 0250h
00131 save_xmm128 xmm12, 0260h
00132 save_xmm128 xmm13, 0270h
00133 save_xmm128 xmm14, 0280h
00134 save_xmm128 xmm15, 0290h
00135 .endprolog
00136
00137 #define REG_output rcx
00138 #define REG_input rdx
00139 #define REG_iterationCount r8
00140 #define REG_state r10
00141 #define REG_rounds e9d
00142 #define REG_roundsLeft eax
00143 #define REG_temp32 r11d
00144 #define REG_temp r11
00145 #define SSE2_WORKSPACE rsp
00146 #else
00147 if (HasSSE2())
00148 {
00149 #if CRYPTOPP_BOOL_X64
00150 #define REG_output %1
00151 #define REG_input %0
00152 #define REG_iterationCount %2
00153 #define REG_state %4
00154 #define REG_rounds %3
00155 #define REG_roundsLeft eax
00156 #define REG_temp32 edx
00157 #define REG_temp rdx
00158 #define SSE2_WORKSPACE %5
00159
00160 CRYPTOPP_ALIGN_DATA(16) byte workspace[16*32];
00161 #else
00162 #define REG_output edi
00163 #define REG_input eax
00164 #define REG_iterationCount ecx
00165 #define REG_state esi
00166 #define REG_rounds edx
00167 #define REG_roundsLeft ebx
00168 #define REG_temp32 ebp
00169 #define REG_temp ebp
00170 #define SSE2_WORKSPACE esp + WORD_SZ
00171 #endif
00172
00173 #ifdef __GNUC__
00174 __asm__ __volatile__
00175 (
00176 INTEL_NOPREFIX
00177 AS_PUSH_IF86( bx)
00178 #else
00179 void *s = m_state.data();
00180 word32 r = m_rounds;
00181
00182 AS2( mov REG_iterationCount, iterationCount)
00183 AS2( mov REG_input, input)
00184 AS2( mov REG_output, output)
00185 AS2( mov REG_state, s)
00186 AS2( mov REG_rounds, r)
00187 #endif
00188 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
00189
00190 AS_PUSH_IF86( bp)
00191 AS2( cmp REG_iterationCount, 4)
00192 ASJ( jl, 5, f)
00193
00194 #if CRYPTOPP_BOOL_X86
00195 AS2( mov ebx, esp)
00196 AS2( and esp, -16)
00197 AS2( sub esp, 32*16)
00198 AS1( push ebx)
00199 #endif
00200
00201 #define SSE2_EXPAND_S(i, j) \
00202 ASS( pshufd xmm4, xmm##i, j, j, j, j) \
00203 AS2( movdqa [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)
00204
00205 AS2( movdqa xmm0, [REG_state + 0*16])
00206 AS2( movdqa xmm1, [REG_state + 1*16])
00207 AS2( movdqa xmm2, [REG_state + 2*16])
00208 AS2( movdqa xmm3, [REG_state + 3*16])
00209 SSE2_EXPAND_S(0, 0)
00210 SSE2_EXPAND_S(0, 1)
00211 SSE2_EXPAND_S(0, 2)
00212 SSE2_EXPAND_S(0, 3)
00213 SSE2_EXPAND_S(1, 0)
00214 SSE2_EXPAND_S(1, 2)
00215 SSE2_EXPAND_S(1, 3)
00216 SSE2_EXPAND_S(2, 1)
00217 SSE2_EXPAND_S(2, 2)
00218 SSE2_EXPAND_S(2, 3)
00219 SSE2_EXPAND_S(3, 0)
00220 SSE2_EXPAND_S(3, 1)
00221 SSE2_EXPAND_S(3, 2)
00222 SSE2_EXPAND_S(3, 3)
00223
00224 #define SSE2_EXPAND_S85(i) \
00225 AS2( mov dword ptr [SSE2_WORKSPACE + 8*16 + i*4 + 256], REG_roundsLeft) \
00226 AS2( mov dword ptr [SSE2_WORKSPACE + 5*16 + i*4 + 256], REG_temp32) \
00227 AS2( add REG_roundsLeft, 1) \
00228 AS2( adc REG_temp32, 0)
00229
00230 ASL(1)
00231 AS2( mov REG_roundsLeft, dword ptr [REG_state + 8*4])
00232 AS2( mov REG_temp32, dword ptr [REG_state + 5*4])
00233 SSE2_EXPAND_S85(0)
00234 SSE2_EXPAND_S85(1)
00235 SSE2_EXPAND_S85(2)
00236 SSE2_EXPAND_S85(3)
00237 AS2( mov dword ptr [REG_state + 8*4], REG_roundsLeft)
00238 AS2( mov dword ptr [REG_state + 5*4], REG_temp32)
00239
00240 #define SSE2_QUARTER_ROUND(a, b, d, i) \
00241 AS2( movdqa xmm4, xmm##d) \
00242 AS2( paddd xmm4, xmm##a) \
00243 AS2( movdqa xmm5, xmm4) \
00244 AS2( pslld xmm4, i) \
00245 AS2( psrld xmm5, 32-i) \
00246 AS2( pxor xmm##b, xmm4) \
00247 AS2( pxor xmm##b, xmm5)
00248
00249 #define L01(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
00250 #define L02(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##C, [SSE2_WORKSPACE + a*16 + i*256])
00251 #define L03(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C)
00252 #define L04(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
00253 #define L05(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 7)
00254 #define L06(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-7)
00255 #define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256])
00256 #define L08(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B)
00257 #define L09(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + b*16], xmm##A)
00258 #define L10(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
00259 #define L11(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C)
00260 #define L12(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
00261 #define L13(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 9)
00262 #define L14(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-9)
00263 #define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256])
00264 #define L16(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D)
00265 #define L17(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + c*16], xmm##A)
00266 #define L18(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
00267 #define L19(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##B)
00268 #define L20(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
00269 #define L21(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 13)
00270 #define L22(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-13)
00271 #define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
00272 #define L24(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B)
00273 #define L25(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + d*16], xmm##A)
00274 #define L26(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##D)
00275 #define L27(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
00276 #define L28(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 18)
00277 #define L29(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-18)
00278 #define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C)
00279 #define L31(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D)
00280 #define L32(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + a*16], xmm##A)
00281
00282 #define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h) \
00283 L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) \
00284 L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) \
00285 L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) \
00286 L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) \
00287 L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) \
00288 L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) \
00289 L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) \
00290 L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) \
00291 L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) \
00292 L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) \
00293 L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) \
00294 L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) \
00295 L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) \
00296 L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) \
00297 L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) \
00298 L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) \
00299 L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) \
00300 L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) \
00301 L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) \
00302 L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) \
00303 L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) \
00304 L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) \
00305 L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) \
00306 L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) \
00307 L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) \
00308 L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) \
00309 L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) \
00310 L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) \
00311 L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) \
00312 L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) \
00313 L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) \
00314 L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i)
00315
00316 #define SSE2_QUARTER_ROUND_X16(i, a, b, c, d, e, f, g, h, A, B, C, D, E, F, G, H) \
00317 L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) L01(8,9,10,11, A,B,C,D, i) L01(12,13,14,15, E,F,G,H, i) \
00318 L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) L02(8,9,10,11, A,B,C,D, i) L02(12,13,14,15, E,F,G,H, i) \
00319 L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) L03(8,9,10,11, A,B,C,D, i) L03(12,13,14,15, E,F,G,H, i) \
00320 L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) L04(8,9,10,11, A,B,C,D, i) L04(12,13,14,15, E,F,G,H, i) \
00321 L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) L05(8,9,10,11, A,B,C,D, i) L05(12,13,14,15, E,F,G,H, i) \
00322 L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) L06(8,9,10,11, A,B,C,D, i) L06(12,13,14,15, E,F,G,H, i) \
00323 L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) L07(8,9,10,11, A,B,C,D, i) L07(12,13,14,15, E,F,G,H, i) \
00324 L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) L08(8,9,10,11, A,B,C,D, i) L08(12,13,14,15, E,F,G,H, i) \
00325 L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) L09(8,9,10,11, A,B,C,D, i) L09(12,13,14,15, E,F,G,H, i) \
00326 L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) L10(8,9,10,11, A,B,C,D, i) L10(12,13,14,15, E,F,G,H, i) \
00327 L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) L11(8,9,10,11, A,B,C,D, i) L11(12,13,14,15, E,F,G,H, i) \
00328 L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) L12(8,9,10,11, A,B,C,D, i) L12(12,13,14,15, E,F,G,H, i) \
00329 L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) L13(8,9,10,11, A,B,C,D, i) L13(12,13,14,15, E,F,G,H, i) \
00330 L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) L14(8,9,10,11, A,B,C,D, i) L14(12,13,14,15, E,F,G,H, i) \
00331 L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) L15(8,9,10,11, A,B,C,D, i) L15(12,13,14,15, E,F,G,H, i) \
00332 L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) L16(8,9,10,11, A,B,C,D, i) L16(12,13,14,15, E,F,G,H, i) \
00333 L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) L17(8,9,10,11, A,B,C,D, i) L17(12,13,14,15, E,F,G,H, i) \
00334 L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) L18(8,9,10,11, A,B,C,D, i) L18(12,13,14,15, E,F,G,H, i) \
00335 L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) L19(8,9,10,11, A,B,C,D, i) L19(12,13,14,15, E,F,G,H, i) \
00336 L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) L20(8,9,10,11, A,B,C,D, i) L20(12,13,14,15, E,F,G,H, i) \
00337 L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) L21(8,9,10,11, A,B,C,D, i) L21(12,13,14,15, E,F,G,H, i) \
00338 L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) L22(8,9,10,11, A,B,C,D, i) L22(12,13,14,15, E,F,G,H, i) \
00339 L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) L23(8,9,10,11, A,B,C,D, i) L23(12,13,14,15, E,F,G,H, i) \
00340 L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) L24(8,9,10,11, A,B,C,D, i) L24(12,13,14,15, E,F,G,H, i) \
00341 L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) L25(8,9,10,11, A,B,C,D, i) L25(12,13,14,15, E,F,G,H, i) \
00342 L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) L26(8,9,10,11, A,B,C,D, i) L26(12,13,14,15, E,F,G,H, i) \
00343 L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) L27(8,9,10,11, A,B,C,D, i) L27(12,13,14,15, E,F,G,H, i) \
00344 L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) L28(8,9,10,11, A,B,C,D, i) L28(12,13,14,15, E,F,G,H, i) \
00345 L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) L29(8,9,10,11, A,B,C,D, i) L29(12,13,14,15, E,F,G,H, i) \
00346 L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) L30(8,9,10,11, A,B,C,D, i) L30(12,13,14,15, E,F,G,H, i) \
00347 L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) L31(8,9,10,11, A,B,C,D, i) L31(12,13,14,15, E,F,G,H, i) \
00348 L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i) L32(8,9,10,11, A,B,C,D, i) L32(12,13,14,15, E,F,G,H, i)
00349
00350 #if CRYPTOPP_BOOL_X64
00351 SSE2_QUARTER_ROUND_X16(1, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
00352 #else
00353 SSE2_QUARTER_ROUND_X8(1, 2, 6, 10, 14, 3, 7, 11, 15)
00354 SSE2_QUARTER_ROUND_X8(1, 0, 4, 8, 12, 1, 5, 9, 13)
00355 #endif
00356 AS2( mov REG_roundsLeft, REG_rounds)
00357 ASJ( jmp, 2, f)
00358
00359 ASL(SSE2_Salsa_Output)
00360 AS2( movdqa xmm0, xmm4)
00361 AS2( punpckldq xmm4, xmm5)
00362 AS2( movdqa xmm1, xmm6)
00363 AS2( punpckldq xmm6, xmm7)
00364 AS2( movdqa xmm2, xmm4)
00365 AS2( punpcklqdq xmm4, xmm6)
00366 AS2( punpckhqdq xmm2, xmm6)
00367 AS2( punpckhdq xmm0, xmm5)
00368 AS2( punpckhdq xmm1, xmm7)
00369 AS2( movdqa xmm6, xmm0)
00370 AS2( punpcklqdq xmm0, xmm1)
00371 AS2( punpckhqdq xmm6, xmm1)
00372 AS_XMM_OUTPUT4(SSE2_Salsa_Output_A, REG_input, REG_output, 4, 2, 0, 6, 1, 0, 4, 8, 12, 1)
00373 AS1( ret)
00374
00375 ASL(6)
00376 #if CRYPTOPP_BOOL_X64
00377 SSE2_QUARTER_ROUND_X16(0, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
00378 ASL(2)
00379 SSE2_QUARTER_ROUND_X16(0, 0, 13, 10, 7, 1, 14, 11, 4, 2, 15, 8, 5, 3, 12, 9, 6)
00380 #else
00381 SSE2_QUARTER_ROUND_X8(0, 2, 6, 10, 14, 3, 7, 11, 15)
00382 SSE2_QUARTER_ROUND_X8(0, 0, 4, 8, 12, 1, 5, 9, 13)
00383 ASL(2)
00384 SSE2_QUARTER_ROUND_X8(0, 2, 15, 8, 5, 3, 12, 9, 6)
00385 SSE2_QUARTER_ROUND_X8(0, 0, 13, 10, 7, 1, 14, 11, 4)
00386 #endif
00387 AS2( sub REG_roundsLeft, 2)
00388 ASJ( jnz, 6, b)
00389
00390 #define SSE2_OUTPUT_4(a, b, c, d) \
00391 AS2( movdqa xmm4, [SSE2_WORKSPACE + a*16 + 256])\
00392 AS2( paddd xmm4, [SSE2_WORKSPACE + a*16])\
00393 AS2( movdqa xmm5, [SSE2_WORKSPACE + b*16 + 256])\
00394 AS2( paddd xmm5, [SSE2_WORKSPACE + b*16])\
00395 AS2( movdqa xmm6, [SSE2_WORKSPACE + c*16 + 256])\
00396 AS2( paddd xmm6, [SSE2_WORKSPACE + c*16])\
00397 AS2( movdqa xmm7, [SSE2_WORKSPACE + d*16 + 256])\
00398 AS2( paddd xmm7, [SSE2_WORKSPACE + d*16])\
00399 ASC( call, SSE2_Salsa_Output)
00400
00401 SSE2_OUTPUT_4(0, 13, 10, 7)
00402 SSE2_OUTPUT_4(4, 1, 14, 11)
00403 SSE2_OUTPUT_4(8, 5, 2, 15)
00404 SSE2_OUTPUT_4(12, 9, 6, 3)
00405 AS2( test REG_input, REG_input)
00406 ASJ( jz, 9, f)
00407 AS2( add REG_input, 12*16)
00408 ASL(9)
00409 AS2( add REG_output, 12*16)
00410 AS2( sub REG_iterationCount, 4)
00411 AS2( cmp REG_iterationCount, 4)
00412 ASJ( jge, 1, b)
00413 AS_POP_IF86( sp)
00414
00415 ASL(5)
00416 AS2( sub REG_iterationCount, 1)
00417 ASJ( jl, 4, f)
00418 AS2( movdqa xmm0, [REG_state + 0*16])
00419 AS2( movdqa xmm1, [REG_state + 1*16])
00420 AS2( movdqa xmm2, [REG_state + 2*16])
00421 AS2( movdqa xmm3, [REG_state + 3*16])
00422 AS2( mov REG_roundsLeft, REG_rounds)
00423
00424 ASL(0)
00425 SSE2_QUARTER_ROUND(0, 1, 3, 7)
00426 SSE2_QUARTER_ROUND(1, 2, 0, 9)
00427 SSE2_QUARTER_ROUND(2, 3, 1, 13)
00428 SSE2_QUARTER_ROUND(3, 0, 2, 18)
00429 ASS( pshufd xmm1, xmm1, 2, 1, 0, 3)
00430 ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
00431 ASS( pshufd xmm3, xmm3, 0, 3, 2, 1)
00432 SSE2_QUARTER_ROUND(0, 3, 1, 7)
00433 SSE2_QUARTER_ROUND(3, 2, 0, 9)
00434 SSE2_QUARTER_ROUND(2, 1, 3, 13)
00435 SSE2_QUARTER_ROUND(1, 0, 2, 18)
00436 ASS( pshufd xmm1, xmm1, 0, 3, 2, 1)
00437 ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
00438 ASS( pshufd xmm3, xmm3, 2, 1, 0, 3)
00439 AS2( sub REG_roundsLeft, 2)
00440 ASJ( jnz, 0, b)
00441
00442 AS2( paddd xmm0, [REG_state + 0*16])
00443 AS2( paddd xmm1, [REG_state + 1*16])
00444 AS2( paddd xmm2, [REG_state + 2*16])
00445 AS2( paddd xmm3, [REG_state + 3*16])
00446
00447 AS2( add dword ptr [REG_state + 8*4], 1)
00448 AS2( adc dword ptr [REG_state + 5*4], 0)
00449
00450 AS2( pcmpeqb xmm6, xmm6)
00451 AS2( psrlq xmm6, 32)
00452 ASS( pshufd xmm7, xmm6, 0, 1, 2, 3)
00453 AS2( movdqa xmm4, xmm0)
00454 AS2( movdqa xmm5, xmm3)
00455 AS2( pand xmm0, xmm7)
00456 AS2( pand xmm4, xmm6)
00457 AS2( pand xmm3, xmm6)
00458 AS2( pand xmm5, xmm7)
00459 AS2( por xmm4, xmm5)
00460 AS2( movdqa xmm5, xmm1)
00461 AS2( pand xmm1, xmm7)
00462 AS2( pand xmm5, xmm6)
00463 AS2( por xmm0, xmm5)
00464 AS2( pand xmm6, xmm2)
00465 AS2( pand xmm2, xmm7)
00466 AS2( por xmm1, xmm6)
00467 AS2( por xmm2, xmm3)
00468
00469 AS2( movdqa xmm5, xmm4)
00470 AS2( movdqa xmm6, xmm0)
00471 AS3( shufpd xmm4, xmm1, 2)
00472 AS3( shufpd xmm0, xmm2, 2)
00473 AS3( shufpd xmm1, xmm5, 2)
00474 AS3( shufpd xmm2, xmm6, 2)
00475
00476
00477 AS_XMM_OUTPUT4(SSE2_Salsa_Output_B, REG_input, REG_output, 4, 0, 1, 2, 3, 0, 1, 2, 3, 4)
00478 ASJ( jmp, 5, b)
00479 ASL(4)
00480
00481 AS_POP_IF86( bp)
00482 #ifdef __GNUC__
00483 AS_POP_IF86( bx)
00484 ATT_PREFIX
00485 #if CRYPTOPP_BOOL_X64
00486 : "+r" (input), "+r" (output), "+r" (iterationCount)
00487 : "r" (m_rounds), "r" (m_state.m_ptr), "r" (workspace)
00488 : "%eax", "%rdx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
00489 #else
00490 : "+a" (input), "+D" (output), "+c" (iterationCount)
00491 : "d" (m_rounds), "S" (m_state.m_ptr)
00492 : "memory", "cc"
00493 #endif
00494 );
00495 #endif
00496 #ifdef CRYPTOPP_GENERATE_X64_MASM
00497 movdqa xmm6, [rsp + 0200h]
00498 movdqa xmm7, [rsp + 0210h]
00499 movdqa xmm8, [rsp + 0220h]
00500 movdqa xmm9, [rsp + 0230h]
00501 movdqa xmm10, [rsp + 0240h]
00502 movdqa xmm11, [rsp + 0250h]
00503 movdqa xmm12, [rsp + 0260h]
00504 movdqa xmm13, [rsp + 0270h]
00505 movdqa xmm14, [rsp + 0280h]
00506 movdqa xmm15, [rsp + 0290h]
00507 add rsp, 10*16 + 32*16 + 8
00508 ret
00509 Salsa20_OperateKeystream ENDP
00510 #else
00511 }
00512 else
00513 #endif
00514 #endif
00515 #ifndef CRYPTOPP_GENERATE_X64_MASM
00516 {
00517 word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
00518
00519 while (iterationCount--)
00520 {
00521 x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
00522 x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7];
00523 x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11];
00524 x12 = m_state[12]; x13 = m_state[13]; x14 = m_state[14]; x15 = m_state[15];
00525
00526 for (int i=m_rounds; i>0; i-=2)
00527 {
00528 #define QUARTER_ROUND(a, b, c, d) \
00529 b = b ^ rotlFixed(a + d, 7); \
00530 c = c ^ rotlFixed(b + a, 9); \
00531 d = d ^ rotlFixed(c + b, 13); \
00532 a = a ^ rotlFixed(d + c, 18);
00533
00534 QUARTER_ROUND(x0, x4, x8, x12)
00535 QUARTER_ROUND(x1, x5, x9, x13)
00536 QUARTER_ROUND(x2, x6, x10, x14)
00537 QUARTER_ROUND(x3, x7, x11, x15)
00538
00539 QUARTER_ROUND(x0, x13, x10, x7)
00540 QUARTER_ROUND(x1, x14, x11, x4)
00541 QUARTER_ROUND(x2, x15, x8, x5)
00542 QUARTER_ROUND(x3, x12, x9, x6)
00543 }
00544
00545 #define SALSA_OUTPUT(x) {\
00546 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
00547 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\
00548 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\
00549 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\
00550 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
00551 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\
00552 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\
00553 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\
00554 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
00555 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\
00556 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\
00557 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\
00558 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
00559 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\
00560 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\
00561 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);}
00562
00563 #ifndef CRYPTOPP_DOXYGEN_PROCESSING
00564 CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION);
00565 #endif
00566
00567 if (++m_state[8] == 0)
00568 ++m_state[5];
00569 }
00570 }
00571 }
00572
00573 void XSalsa20_Policy::CipherSetKey(const NameValuePairs ¶ms, const byte *key, size_t length)
00574 {
00575 m_rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);
00576
00577 if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
00578 throw InvalidRounds(XSalsa20::StaticAlgorithmName(), m_rounds);
00579
00580 GetUserKey(LITTLE_ENDIAN_ORDER, m_key.begin(), m_key.size(), key, length);
00581 if (length == 16)
00582 memcpy(m_key.begin()+4, m_key.begin(), 16);
00583
00584
00585 m_state[0] = 0x61707865;
00586 m_state[1] = 0x3320646e;
00587 m_state[2] = 0x79622d32;
00588 m_state[3] = 0x6b206574;
00589 }
00590
00591 void XSalsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
00592 {
00593 CRYPTOPP_UNUSED(keystreamBuffer), CRYPTOPP_UNUSED(length);
00594 assert(length==24);
00595
00596 word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
00597
00598 GetBlock<word32, LittleEndian> get(IV);
00599 get(x14)(x11)(x8)(x5)(m_state[14])(m_state[11]);
00600
00601 x13 = m_key[0]; x10 = m_key[1]; x7 = m_key[2]; x4 = m_key[3];
00602 x15 = m_key[4]; x12 = m_key[5]; x9 = m_key[6]; x6 = m_key[7];
00603 x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
00604
00605 for (int i=m_rounds; i>0; i-=2)
00606 {
00607 QUARTER_ROUND(x0, x4, x8, x12)
00608 QUARTER_ROUND(x1, x5, x9, x13)
00609 QUARTER_ROUND(x2, x6, x10, x14)
00610 QUARTER_ROUND(x3, x7, x11, x15)
00611
00612 QUARTER_ROUND(x0, x13, x10, x7)
00613 QUARTER_ROUND(x1, x14, x11, x4)
00614 QUARTER_ROUND(x2, x15, x8, x5)
00615 QUARTER_ROUND(x3, x12, x9, x6)
00616 }
00617
00618 m_state[13] = x0; m_state[10] = x1; m_state[7] = x2; m_state[4] = x3;
00619 m_state[15] = x14; m_state[12] = x11; m_state[9] = x8; m_state[6] = x5;
00620 m_state[8] = m_state[5] = 0;
00621 }
00622
00623 NAMESPACE_END
00624
00625 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM