Skip to content
Snippets Groups Projects
Commit ddc2a6fd authored by Pedro Gonnet's avatar Pedro Gonnet
Browse files

remove Random123 sub-directory, not needed in this branch.

parent d52a08c1
No related branches found
No related tags found
1 merge request!866New random_unit_interval implementation
Showing
with 0 additions and 3359 deletions
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _r123array_dot_h__
#define _r123array_dot_h__
#include "features/compilerfeatures.h"
#include "features/sse.h"
#ifndef __cplusplus
/* Plain C build (__cplusplus is not defined): the array structs get no
   member functions and no stream operators, so both helper macros
   expand to nothing. */
#define CXXMETHODS(_N, W, T)
#define CXXOVERLOADS(_N, W, T)
#else
#include <stddef.h>
#include <algorithm>
#include <stdexcept>
#include <iterator>
#include <limits>
#include <iostream>
/** @defgroup arrayNxW The r123arrayNxW classes
Each r123arrayNxW is a fixed-size array of N W-bit unsigned integers.
It is functionally equivalent to the C++0x std::array<uintW_t, N>,
but does not require C++0x features or libraries.
In addition to meeting most of the requirements of a Container,
it also has a member function, incr(), which increments the zero-th
element and carries overflows into higher-indexed elements. Thus,
by using incr(), sequences of up to 2^(N*W) distinct values
can be produced.
If SSE is supported by the compiler, then the class
r123array1xm128i is also defined, in which the data member is an
array of one r123m128i object.
@cond HIDDEN_FROM_DOXYGEN
*/
/* Build one value_type from consecutive 32-bit words starting at p32,
   least-significant word first.  ceil(sizeof(value_type)/4) words are
   read; the caller's pointer is not advanced (it is passed by value). */
template <typename value_type>
inline R123_CUDA_DEVICE value_type assemble_from_u32(uint32_t *p32){
    const size_t nwords = (3+sizeof(value_type))/4;
    value_type result = 0;
    size_t word = 0;
    while(word < nwords){
        result |= ((value_type)(p32[word])) << (32*word);
        ++word;
    }
    return result;
}
// Work-alike methods and typedefs modeled on std::array:
#define CXXMETHODS(_N, W, T) \
typedef T value_type; \
typedef T* iterator; \
typedef const T* const_iterator; \
typedef value_type& reference; \
typedef const value_type& const_reference; \
typedef size_t size_type; \
typedef ptrdiff_t difference_type; \
typedef T* pointer; \
typedef const T* const_pointer; \
typedef std::reverse_iterator<iterator> reverse_iterator; \
typedef std::reverse_iterator<const_iterator> const_reverse_iterator; \
/* Boost.array has static_size. C++11 specializes tuple_size */ \
enum {static_size = _N}; \
R123_CUDA_DEVICE reference operator[](size_type i){return v[i];} \
R123_CUDA_DEVICE const_reference operator[](size_type i) const {return v[i];} \
R123_CUDA_DEVICE reference at(size_type i){ if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \
R123_CUDA_DEVICE const_reference at(size_type i) const { if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \
R123_CUDA_DEVICE size_type size() const { return _N; } \
R123_CUDA_DEVICE size_type max_size() const { return _N; } \
R123_CUDA_DEVICE bool empty() const { return _N==0; }; \
R123_CUDA_DEVICE iterator begin() { return &v[0]; } \
R123_CUDA_DEVICE iterator end() { return &v[_N]; } \
R123_CUDA_DEVICE const_iterator begin() const { return &v[0]; } \
R123_CUDA_DEVICE const_iterator end() const { return &v[_N]; } \
R123_CUDA_DEVICE const_iterator cbegin() const { return &v[0]; } \
R123_CUDA_DEVICE const_iterator cend() const { return &v[_N]; } \
R123_CUDA_DEVICE reverse_iterator rbegin(){ return reverse_iterator(end()); } \
R123_CUDA_DEVICE const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); } \
R123_CUDA_DEVICE reverse_iterator rend(){ return reverse_iterator(begin()); } \
R123_CUDA_DEVICE const_reverse_iterator rend() const{ return const_reverse_iterator(begin()); } \
R123_CUDA_DEVICE const_reverse_iterator crbegin() const{ return const_reverse_iterator(cend()); } \
R123_CUDA_DEVICE const_reverse_iterator crend() const{ return const_reverse_iterator(cbegin()); } \
R123_CUDA_DEVICE pointer data(){ return &v[0]; } \
R123_CUDA_DEVICE const_pointer data() const{ return &v[0]; } \
R123_CUDA_DEVICE reference front(){ return v[0]; } \
R123_CUDA_DEVICE const_reference front() const{ return v[0]; } \
R123_CUDA_DEVICE reference back(){ return v[_N-1]; } \
R123_CUDA_DEVICE const_reference back() const{ return v[_N-1]; } \
R123_CUDA_DEVICE bool operator==(const r123array##_N##x##W& rhs) const{ \
/* CUDA3 does not have std::equal */ \
for (size_t i = 0; i < _N; ++i) \
if (v[i] != rhs.v[i]) return false; \
return true; \
} \
R123_CUDA_DEVICE bool operator!=(const r123array##_N##x##W& rhs) const{ return !(*this == rhs); } \
/* CUDA3 does not have std::fill_n */ \
R123_CUDA_DEVICE void fill(const value_type& val){ for (size_t i = 0; i < _N; ++i) v[i] = val; } \
R123_CUDA_DEVICE void swap(r123array##_N##x##W& rhs){ \
/* CUDA3 does not have std::swap_ranges */ \
for (size_t i = 0; i < _N; ++i) { \
T tmp = v[i]; \
v[i] = rhs.v[i]; \
rhs.v[i] = tmp; \
} \
} \
R123_CUDA_DEVICE r123array##_N##x##W& incr(R123_ULONG_LONG n=1){ \
/* This test is tricky because we're trying to avoid spurious \
complaints about illegal shifts, yet still be compile-time \
evaulated. */ \
if(sizeof(T)<sizeof(n) && n>>((sizeof(T)<sizeof(n))?8*sizeof(T):0) ) \
return incr_carefully(n); \
if(n==1){ \
++v[0]; \
if(_N==1 || R123_BUILTIN_EXPECT(!!v[0], 1)) return *this; \
}else{ \
v[0] += n; \
if(_N==1 || R123_BUILTIN_EXPECT(n<=v[0], 1)) return *this; \
} \
/* We expect that the N==?? tests will be \
constant-folded/optimized away by the compiler, so only the \
overflow tests (!!v[i]) remain to be done at runtime. For \
small values of N, it would be better to do this as an \
uncondtional sequence of adc. An experiment/optimization \
for another day... \
N.B. The weird subscripting: v[_N>3?3:0] is to silence \
a spurious error from icpc \
*/ \
++v[_N>1?1:0]; \
if(_N==2 || R123_BUILTIN_EXPECT(!!v[_N>1?1:0], 1)) return *this; \
++v[_N>2?2:0]; \
if(_N==3 || R123_BUILTIN_EXPECT(!!v[_N>2?2:0], 1)) return *this; \
++v[_N>3?3:0]; \
for(size_t i=4; i<_N; ++i){ \
if( R123_BUILTIN_EXPECT(!!v[i-1], 1) ) return *this; \
++v[i]; \
} \
return *this; \
} \
/* seed(SeedSeq) would be a constructor if having a constructor */ \
/* didn't cause headaches with defaults */ \
template <typename SeedSeq> \
R123_CUDA_DEVICE static r123array##_N##x##W seed(SeedSeq &ss){ \
r123array##_N##x##W ret; \
const size_t Ngen = _N*((3+sizeof(value_type))/4); \
uint32_t u32[Ngen]; \
uint32_t *p32 = &u32[0]; \
ss.generate(&u32[0], &u32[Ngen]); \
for(size_t i=0; i<_N; ++i){ \
ret.v[i] = assemble_from_u32<value_type>(p32); \
p32 += (3+sizeof(value_type))/4; \
} \
return ret; \
} \
protected: \
R123_CUDA_DEVICE r123array##_N##x##W& incr_carefully(R123_ULONG_LONG n){ \
/* n may be greater than the maximum value of a single value_type */ \
value_type vtn; \
vtn = n; \
v[0] += n; \
const unsigned rshift = 8* ((sizeof(n)>sizeof(value_type))? sizeof(value_type) : 0); \
for(size_t i=1; i<_N; ++i){ \
if(rshift){ \
n >>= rshift; \
}else{ \
n=0; \
} \
if( v[i-1] < vtn ) \
++n; \
if( n==0 ) break; \
vtn = n; \
v[i] += n; \
} \
return *this; \
} \
// There are several tricky considerations for the insertion and extraction
// operators:
// - we would like to be able to print r123array16x8 as a sequence of 16 integers,
// not as 16 bytes.
// - we would like to be able to print r123array1xm128i.
// - we do not want an int conversion operator in r123m128i because it causes
// lots of ambiguity problems with automatic promotions.
// Solution: r123arrayinsertable and r123arrayextractable
/* Wrapper that controls how one array element is written to a stream.
   The primary template forwards to the element's own operator<<. */
template<typename T>
struct r123arrayinsertable{
    const T& v;
    r123arrayinsertable(const T& elem) : v(elem) {}
    friend std::ostream& operator<<(std::ostream& out, const r123arrayinsertable<T>& item){
        return out << item.v;
    }
};
/* uint8_t elements are written as decimal integers rather than as
   characters, so the value is widened to int before insertion. */
template<>
struct r123arrayinsertable<uint8_t>{
    const uint8_t& v;
    r123arrayinsertable(const uint8_t& elem) : v(elem) {}
    friend std::ostream& operator<<(std::ostream& out, const r123arrayinsertable<uint8_t>& item){
        const int as_number = item.v;
        return out << as_number;
    }
};
/* Wrapper that controls how one array element is read from a stream.
   The primary template forwards to the element's own operator>>. */
template<typename T>
struct r123arrayextractable{
    T& v;
    r123arrayextractable(T& t_) : v(t_) {}
    friend std::istream& operator>>(std::istream& is, r123arrayextractable<T>& t){
        return is >> t.v;
    }
};
/* uint8_t elements are read as decimal integers, not as single characters:
   the text is parsed into an int and then narrowed to uint8_t. */
template<>
struct r123arrayextractable<uint8_t>{
    uint8_t& v;
    r123arrayextractable(uint8_t& t_) : v(t_) {}
    friend std::istream& operator>>(std::istream& is, r123arrayextractable<uint8_t>& t){
        /* Start from 0: with pre-C++11 stream semantics a failed extraction
           leaves the target untouched, and the assignment below would then
           read an indeterminate value.  C++11 streams store 0 on failure,
           so the result is now the same under both rules. */
        int i = 0;
        is >> i;
        t.v = static_cast<uint8_t>(i);
        return is;
    }
};
/* CXXOVERLOADS(_N, W, T) expands to the free functions that accompany
   struct r123array<_N>x<W>: a stream inserter that writes the elements
   separated by single spaces, a stream extractor that reads _N elements,
   and the typedef r123::Array<_N>x<W>.  Elements go through
   r123arrayinsertable / r123arrayextractable. */
#define CXXOVERLOADS(_N, W, T) \
\
inline std::ostream& operator<<(std::ostream& out, const r123array##_N##x##W& arr){ \
for(size_t k=0; k<_N; ++k){ \
if(k != 0) \
out << " "; \
out << r123arrayinsertable<T>(arr.v[k]); \
} \
return out; \
} \
\
inline std::istream& operator>>(std::istream& in, r123array##_N##x##W& arr){ \
size_t k = 0; \
while(k < _N){ \
r123arrayextractable<T> slot(arr.v[k]); \
in >> slot; \
++k; \
} \
return in; \
} \
\
namespace r123{ \
typedef r123array##_N##x##W Array##_N##x##W; \
}
#endif /* __cplusplus */
/* _r123array_tpl(_N, W, T) expands to a declaration of struct r123arrayNxW,
   where the name is formed by pasting _N, the letter x and W together.
     _N - number of elements
     W  - width tag used only in the type name (e.g. 32, 64, 8, m128i)
     T  - element type
   In C, it's nothing more than a struct containing an array of N
   objects of type T.
   In C++ it's the same, but endowed with the member functions and
   typedefs from CXXMETHODS and the stream operators from CXXOVERLOADS.
   In C++, r123arrayNxW looks a lot like std::array<T,N>, has most of the
   capabilities of a container, and satisfies the requirements outlined in
   compat/Engine.hpp for counter and key types.  ArrayNxW, in the r123
   namespace, is a typedef equivalent to r123arrayNxW.
*/
#define _r123array_tpl(_N, W, T) \
/** @ingroup arrayNxW */ \
/** @see arrayNxW */ \
struct r123array##_N##x##W{ \
T v[_N]; \
CXXMETHODS(_N, W, T) \
}; \
\
CXXOVERLOADS(_N, W, T)
/** @endcond */
/* The concrete array types.  Each line declares one struct; the resulting
   type name is given in the trailing comment. */
_r123array_tpl(1, 32, uint32_t) /* r123array1x32 */
_r123array_tpl(2, 32, uint32_t) /* r123array2x32 */
_r123array_tpl(4, 32, uint32_t) /* r123array4x32 */
_r123array_tpl(8, 32, uint32_t) /* r123array8x32 */
_r123array_tpl(1, 64, uint64_t) /* r123array1x64 */
_r123array_tpl(2, 64, uint64_t) /* r123array2x64 */
_r123array_tpl(4, 64, uint64_t) /* r123array4x64 */
_r123array_tpl(16, 8, uint8_t) /* r123array16x8 for ARSsw, AESsw */
/* The SSE-backed array only exists when R123_USE_SSE is non-zero. */
#if R123_USE_SSE
_r123array_tpl(1, m128i, r123m128i) /* r123array1xm128i for ARSni, AESni */
#endif
/* In C++, it's natural to use sizeof(a::value_type), but in C it's
pretty convoluted to figure out the width of the value_type of an
r123arrayNxW.
R123_W(a) takes an array TYPE name and yields 8 times the size in bytes
of one element of its v member, i.e. the element width in bits on a
platform with 8-bit bytes.  The null pointer is only an operand of
sizeof, which does not evaluate it, so nothing is dereferenced.
*/
#define R123_W(a) (8*sizeof(((a *)0)->v[0]))
/** @namespace r123
Most of the Random123 C++ API is contained in the r123 namespace.
*/
#endif
/*
Copyright 2010-2016, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __clangfeatures_dot_hpp
#define __clangfeatures_dot_hpp
/* R123_USE_X86INTRIN_H: 1 when compiling for x86-64 or i386, else 0.
   May be overridden by defining the symbol before this header is read.
   The value is computed here with #if instead of letting the macro expand
   to a defined(...) expression: a `defined` operator that appears as the
   result of macro expansion inside #if has undefined behaviour, and such
   a macro cannot be used in ordinary expressions at all. */
#ifndef R123_USE_X86INTRIN_H
#if defined(__x86_64__) || defined(__i386__)
#define R123_USE_X86INTRIN_H 1
#else
#define R123_USE_X86INTRIN_H 0
#endif
#endif
/* Whether C++11 unrestricted unions are available, as reported by
   clang's __has_feature.  Skipped if the symbol is already defined. */
#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS
#define R123_USE_CXX11_UNRESTRICTED_UNIONS __has_feature(cxx_unrestricted_unions)
#endif
/* Whether the C++11 static_assert declaration is available, as reported
   by clang's __has_feature.  Skipped if the symbol is already defined. */
#ifndef R123_USE_CXX11_STATIC_ASSERT
#define R123_USE_CXX11_STATIC_ASSERT __has_feature(cxx_static_assert)
#endif
// Fallback compile-time assertion for clang when static_assert is not
// available and nobody has defined R123_STATIC_ASSERT yet.  It declares a
// typedef of a char array whose size is 1 when expr is true and -1 when
// expr is false; a negative array size is a compile error.  msg is unused.
//
// With clang-3.6, -Wall warns about unused-local-typedefs.
// The "obvious" thing to do is to ignore -Wunused-local-typedefs,
// but that doesn't work because earlier versions of clang blow
// up on an 'unknown warning group'. So we briefly ignore -Wall...
// It's tempting to just give up on static assertions in pre-c++11 code.
#if !R123_USE_CXX11_STATIC_ASSERT && !defined(R123_STATIC_ASSERT)
#define R123_STATIC_ASSERT(expr, msg) \
_Pragma("clang diagnostic push") \
_Pragma("clang diagnostic ignored \"-Wall\"") \
typedef char static_assertion[(!!(expr))*2-1] \
_Pragma("clang diagnostic pop")
#endif
/* Language features queried through clang's __has_feature.  Each one is
   skipped if the symbol is already defined. */
#ifndef R123_USE_CXX11_CONSTEXPR
#define R123_USE_CXX11_CONSTEXPR __has_feature(cxx_constexpr)
#endif
#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS
#define R123_USE_CXX11_EXPLICIT_CONVERSIONS __has_feature(cxx_explicit_conversions)
#endif
/* Library features: enabled only when compiling as C++11 or later AND the
   corresponding standard header can be found with __has_include. */
// With clang-3.0, the apparently simpler:
// #define R123_USE_CXX11_RANDOM __has_include(<random>)
// dumps core.
#ifndef R123_USE_CXX11_RANDOM
#if __cplusplus>=201103L && __has_include(<random>)
#define R123_USE_CXX11_RANDOM 1
#else
#define R123_USE_CXX11_RANDOM 0
#endif
#endif
#ifndef R123_USE_CXX11_TYPE_TRAITS
#if __cplusplus>=201103L && __has_include(<type_traits>)
#define R123_USE_CXX11_TYPE_TRAITS 1
#else
#define R123_USE_CXX11_TYPE_TRAITS 0
#endif
#endif
#include "gccfeatures.h"
#endif
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
@page porting Preprocessor symbols for porting Random123 to different platforms.
The Random123 library is portable across C, C++, CUDA, OpenCL environments,
and multiple operating systems (Linux, Windows 7, Mac OS X, FreeBSD, Solaris).
This level of portability requires the abstraction of some features
and idioms that are either not standardized (e.g., asm statements), or for which
different vendors have their own standards (e.g., SSE intrinsics) or for
which vendors simply refuse to conform to well-established standards (e.g., <inttypes.h>).
Random123/features/compilerfeatures.h
conditionally includes a compiler-or-OS-specific Random123/features/XXXfeatures.h file which
defines appropriate values for the preprocessor symbols which can be used with
a specific compiler or OS. Those symbols will then
be used by other header files and source files in the Random123
library (and may be used by applications) to control what actually
gets presented to the compiler.
Most of the symbols are boolean valued. In general, they will
\b always be defined with value either 1 or 0, so do
\b NOT use \#ifdef. Use \#if R123_USE_SOMETHING instead.
Library users can override any value by defining the pp-symbol with a compiler option,
e.g.,
cc -DR123_USE_MULHILO64_C99
will use a strictly c99 version of the full-width 64x64->128-bit multiplication
function, even if it would be disabled by default.
All boolean-valued pre-processor symbols in Random123/features/compilerfeatures.h start with the prefix R123_USE_
@verbatim
AES_NI
AES_OPENSSL
SSE4_2
SSE4_1
SSE
STD_RANDOM
GNU_UINT128
ASM_GNU
ASM_MSASM
CPUID_MSVC
CXX11_RANDOM
CXX11_TYPE_TRAITS
CXX11_STATIC_ASSERT
CXX11_CONSTEXPR
CXX11_UNRESTRICTED_UNIONS
CXX11_EXPLICIT_CONVERSIONS
CXX11_LONG_LONG
CXX11
X86INTRIN_H
IA32INTRIN_H
XMMINTRIN_H
EMMINTRIN_H
SMMINTRIN_H
WMMINTRIN_H
INTRIN_H
MULHILO32_ASM
MULHILO64_ASM
MULHILO64_MSVC_INTRIN
MULHILO64_CUDA_INTRIN
MULHILO64_OPENCL_INTRIN
MULHILO64_C99
U01_DOUBLE
@endverbatim
Most have obvious meanings. Some non-obvious ones:
AES_NI and AES_OPENSSL are not mutually exclusive. You can have one,
both or neither.
GNU_UINT128 says that it's safe to use __uint128_t, but it
does not require its use. In particular, it should be
used in mulhilo<uint64_t> only if MULHILO64_ASM is unset.
If the XXXINTRIN_H macros are true, then one should
@code
#include <xxxintrin.h>
@endcode
to gain access to compiler intrinsics.
The CXX11_SOME_FEATURE macros allow the code to use specific
features of the C++11 language and library.
In the absence of a specific CXX11_SOME_FEATURE, the feature
is controlled by the catch-all R123_USE_CXX11 macro.
U01_DOUBLE defaults on, and can be turned off (set to 0)
if one does not want the utility functions that convert to double
(i.e. u01_*_53()), e.g. on OpenCL without the cl_khr_fp64 extension.
There are a number of invariants that are always true. Application code may
choose to rely on these:
<ul>
<li>ASM_GNU and ASM_MSASM are mutually exclusive
<li>The "higher" SSE values imply the lower ones.
</ul>
There are also non-boolean valued symbols:
<ul>
<li>R123_STATIC_INLINE -
According to both C99 and GNU99, the 'static inline' declaration allows
the compiler to not emit code if the function is not used.
Note that the semantics of 'inline', 'static' and 'extern' in
gcc have changed over time and are subject to modification by
command line options, e.g., -std=gnu89, -fgnu-inline.
Nevertheless, it appears that the meaning of 'static inline'
has not changed over time and (with a little luck) the use of 'static inline'
here will be portable between versions of gcc and to other C99
compilers.
See: http://gcc.gnu.org/onlinedocs/gcc/Inline.html
http://www.greenend.org.uk/rjk/2003/03/inline.html
<li>R123_FORCE_INLINE(decl) -
which expands to 'decl', adorned with the compiler-specific
embellishments to strongly encourage that the declared function be
inlined. If there is no such compiler-specific magic, it should
expand to decl, unadorned.
<li>R123_CUDA_DEVICE - which expands to __device__ (or something else with
sufficiently similar semantics) when CUDA is in use, and expands
to nothing in other cases.
<li>R123_ASSERT(x) - which expands to assert(x), or maybe to nothing at
all if we're in an environment so feature-poor that you can't even
call assert (I'm looking at you, CUDA and OpenCL), or even include
assert.h safely (OpenCL).
<li>R123_STATIC_ASSERT(expr,msg) - which expands to
static_assert(expr,msg), or to an expression that
will raise a compile-time exception if expr is not true.
<li>R123_ULONG_LONG - which expands to a declaration of the longest available
unsigned integer.
<li>R123_64BIT(x) - expands to something equivalent to
UINT64_C(x) from <stdint.h>, even in environments where <stdint.h>
is not available, e.g., MSVC and OpenCL.
<li>R123_BUILTIN_EXPECT(expr,likely_value) - expands to something with
the semantics of gcc's __builtin_expect(expr,likely_value). If
the environment has nothing like __builtin_expect, it should expand
to just expr.
</ul>
\cond HIDDEN_FROM_DOXYGEN
*/
/*
N.B. When something is added to the list of features, it should be
added to each of the *features.h files, AND to examples/ut_features.cpp.
*/
/* Select exactly one compiler-specific feature header.  The first branch
   whose identifying macro is defined wins.
   N.B. most other compilers (icc, nvcc, open64, llvm) will also define __GNUC__, so order matters:
   the __GNUC__ test must come after the tests for those compilers. */
#if defined(__OPENCL_VERSION__) && __OPENCL_VERSION__ > 0
#include "openclfeatures.h"
#elif defined(__CUDACC__)
#include "nvccfeatures.h"
#elif defined(__ICC)
#include "iccfeatures.h"
#elif defined(__xlC__)
#include "xlcfeatures.h"
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
#include "sunprofeatures.h"
#elif defined(__OPEN64__)
#include "open64features.h"
#elif defined(__clang__)
#include "clangfeatures.h"
#elif defined(__GNUC__)
#include "gccfeatures.h"
#elif defined(__PGI)
#include "pgccfeatures.h"
#elif defined(_MSC_FULL_VER)
#include "msvcfeatures.h"
#else
/* Unknown compiler: stop with an error.  The stray brace below is a
   second line of defence for compilers that keep going after #error. */
#error "Can't identify compiler. You'll need to add a new xxfeatures.hpp"
{ /* maybe an unbalanced brace will terminate the compilation */
#endif
#ifndef R123_USE_CXX11
#define R123_USE_CXX11 0
#endif
#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS
#define R123_USE_CXX11_UNRESTRICTED_UNIONS R123_USE_CXX11
#endif
//#ifndef R123_USE_CXX11_STATIC_ASSERT
//#define R123_USE_CXX11_STATIC_ASSERT R123_USE_CXX11
//#endif
//#ifndef R123_USE_CXX11_CONSTEXPR
//#define R123_USE_CXX11_CONSTEXPR R123_USE_CXX11
//#endif
#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS
#define R123_USE_CXX11_EXPLICIT_CONVERSIONS R123_USE_CXX11
#endif
#ifndef R123_USE_CXX11_RANDOM
#define R123_USE_CXX11_RANDOM R123_USE_CXX11
#endif
#ifndef R123_USE_CXX11_TYPE_TRAITS
#define R123_USE_CXX11_TYPE_TRAITS R123_USE_CXX11
#endif
#ifndef R123_USE_CXX11_LONG_LONG
#define R123_USE_CXX11_LONG_LONG R123_USE_CXX11
#endif
#ifndef R123_USE_MULHILO64_C99
#define R123_USE_MULHILO64_C99 0
#endif
#ifndef R123_USE_MULHILO64_MULHI_INTRIN
#define R123_USE_MULHILO64_MULHI_INTRIN 0
#endif
#ifndef R123_USE_MULHILO32_MULHI_INTRIN
#define R123_USE_MULHILO32_MULHI_INTRIN 0
#endif
/* R123_STATIC_ASSERT(expr, msg): compile-time assertion.
   Declares a typedef of a char array whose size is 1 when expr is true
   and -1 when it is false; the negative size is a compile error.  msg is
   not used by this form.
   A variant that expanded to C++11 static_assert(expr, msg) when
   R123_USE_CXX11_STATIC_ASSERT is set existed here but was commented out
   in this copy; only the typedef form is provided.
   The #ifndef guard keeps a definition supplied earlier (for example by a
   compiler-specific feature header or on the command line) instead of
   redefining the macro with a different body. */
#ifndef R123_STATIC_ASSERT
#define R123_STATIC_ASSERT(expr, msg) typedef char static_assertion[(!!(expr))*2-1]
#endif
/* R123_CONSTEXPR: decoration placed where `constexpr` could be used.
   In this copy it expands to nothing; the variant that expanded to
   `constexpr` when R123_USE_CXX11_CONSTEXPR is set was commented out.
   The #ifndef guard lets a definition supplied earlier (e.g. with -D)
   take precedence, as for the other symbols in this file, instead of
   triggering a macro redefinition. */
#ifndef R123_CONSTEXPR
#define R123_CONSTEXPR
#endif
/* R123_USE_PHILOX_64BIT is true when at least one of the 64x64-bit
   multiplication strategies listed below is enabled. */
#ifndef R123_USE_PHILOX_64BIT
#define R123_USE_PHILOX_64BIT (R123_USE_MULHILO64_ASM || R123_USE_MULHILO64_MSVC_INTRIN || R123_USE_MULHILO64_CUDA_INTRIN || R123_USE_GNU_UINT128 || R123_USE_MULHILO64_C99 || R123_USE_MULHILO64_OPENCL_INTRIN || R123_USE_MULHILO64_MULHI_INTRIN)
#endif
/* R123_ULONG_LONG: the widest unsigned integer type used by the library.
   uint64_t in C++ without long long support, unsigned long long otherwise. */
#ifndef R123_ULONG_LONG
#if defined(__cplusplus) && !R123_USE_CXX11_LONG_LONG
/* C++98 doesn't have long long. It doesn't have uint64_t either, but
we will have typedef'ed uint64_t to something in the xxxfeatures.h.
With luck, it won't elicit complaints from -pedantic. Cross your
fingers... */
#define R123_ULONG_LONG uint64_t
#else
#define R123_ULONG_LONG unsigned long long
#endif
#endif
/* R123_64BIT(x): spell a 64-bit unsigned integer constant.
UINT64_C should have been #defined by XXXfeatures.h, either by
#include <stdint.h> or through compiler-dependent hacks */
#ifndef R123_64BIT
#define R123_64BIT(x) UINT64_C(x)
#endif
/* R123_THROW(x): how the library raises an exception; a plain throw
   unless the symbol was defined earlier. */
#ifndef R123_THROW
#define R123_THROW(x) throw (x)
#endif
/*
 * Windows.h (and perhaps other "well-meaning" code) defines min and
 * max as macros, so there's a high chance that our definition of min,
 * max methods or use of std::numeric_limits min and max will cause
 * complaints in any program that happened to include Windows.h or
 * suchlike first. We use the null macro below in our own header
 * files' definitions and uses of min, max to defensively preclude
 * this problem. It may not be enough; one might need to #define
 * NOMINMAX before including Windows.h or compile with -DNOMINMAX.
 *
 * R123_NO_MACRO_SUBST expands to nothing.
 */
#define R123_NO_MACRO_SUBST
/** \endcond */
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __gccfeatures_dot_hpp
#define __gccfeatures_dot_hpp
/* gcc version packed into one decimal number: major*10000 + minor*100 +
   patchlevel, so gcc 4.4.2 becomes 40402. */
#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
/* Refuse to build for anything other than x86-64, i386 or powerpc.  The
   bogus #include and the stray brace are extra obstacles for compilers
   that carry on after #error. */
#if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__)
# error "This code has only been tested on x86 and powerpc platforms."
#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
{ /* maybe an unbalanced brace will terminate the compilation */
/* Feel free to try the Random123 library on other architectures by changing
the conditions that reach this error, but you should consider it a
porting exercise and expect to encounter bugs and deficiencies.
Please let the authors know of any successes (or failures). */
#endif
/* powerpc builds pull in the PPU intrinsics header. */
#ifdef __powerpc__
#include <ppu_intrinsics.h>
#endif
/* Storage-class/inline specifier used for the library's inline functions. */
#ifndef R123_STATIC_INLINE
#define R123_STATIC_INLINE static __inline__
#endif
/* R123_FORCE_INLINE(decl): decl with the always_inline attribute appended
   for gcc 4.0.0 and later; older versions get decl unchanged. */
#ifndef R123_FORCE_INLINE
#if R123_GNUC_VERSION >= 40000
#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
#else
#define R123_FORCE_INLINE(decl) decl
#endif
#endif
/* No device qualifier is needed with this compiler: expands to nothing. */
#ifndef R123_CUDA_DEVICE
#define R123_CUDA_DEVICE
#endif
/* R123_ASSERT(x) is the standard assert(x). */
#ifndef R123_ASSERT
#include <assert.h>
#define R123_ASSERT(x) assert(x)
#endif
/* Branch-prediction hint: maps directly onto gcc's __builtin_expect. */
#ifndef R123_BUILTIN_EXPECT
#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
#endif
/* According to the C++0x standard, we should be able to test the numeric
value of __cplusplus == 199701L for C++98, __cplusplus == 201103L for C++0x
But gcc has had an open bug http://gcc.gnu.org/bugzilla/show_bug.cgi?id=1773
since early 2001, which was finally fixed in 4.7 (early 2012). For
earlier versions, the only way to detect whether --std=c++0x was requested
on the command line is to look at the __GCC_EXPERIMENTAL_CXX0X__ pp-symbol.

GNU_CXX11 is 1 when C++11 mode is detected by either test and 0 otherwise.
The tests are evaluated here, once, rather than leaving a defined(...)
operator in the macro body: `defined` produced by macro expansion inside
#if has undefined behaviour, and every R123_USE_CXX11_* symbol built on
GNU_CXX11 would inherit that problem.
*/
#if __cplusplus>=201103L || (R123_GNUC_VERSION<40700 && defined(__GCC_EXPERIMENTAL_CXX0X__))
#define GNU_CXX11 1
#else
#define GNU_CXX11 0
#endif
/* Individual C++11 features.  Each is enabled when C++11 mode is active
   (GNU_CXX11) and the gcc version is at least the one given in the
   condition: 4.6.0 for unrestricted unions and constexpr, 4.3.0 for
   static_assert, 4.5.0 for explicit conversions and <random>, 4.4.0 for
   <type_traits>.  A symbol that is already defined is left alone. */
#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS
#define R123_USE_CXX11_UNRESTRICTED_UNIONS ((R123_GNUC_VERSION >= 40600) && GNU_CXX11)
#endif
#ifndef R123_USE_CXX11_STATIC_ASSERT
#define R123_USE_CXX11_STATIC_ASSERT ((R123_GNUC_VERSION >= 40300) && GNU_CXX11)
#endif
#ifndef R123_USE_CXX11_CONSTEXPR
#define R123_USE_CXX11_CONSTEXPR ((R123_GNUC_VERSION >= 40600) && GNU_CXX11)
#endif
#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS
#define R123_USE_CXX11_EXPLICIT_CONVERSIONS ((R123_GNUC_VERSION >= 40500) && GNU_CXX11)
#endif
#ifndef R123_USE_CXX11_RANDOM
#define R123_USE_CXX11_RANDOM ((R123_GNUC_VERSION>=40500) && GNU_CXX11)
#endif
#ifndef R123_USE_CXX11_TYPE_TRAITS
#define R123_USE_CXX11_TYPE_TRAITS ((R123_GNUC_VERSION>=40400) && GNU_CXX11)
#endif
/* Instruction-set switches.  Each symbol keeps a value supplied by the
   user; otherwise it becomes 1 when the compiler's own target macro is
   defined and 0 when it is not. */
#if defined(R123_USE_AES_NI)
/* keep the caller's setting */
#elif defined(__AES__)
#define R123_USE_AES_NI 1
#else
#define R123_USE_AES_NI 0
#endif

#if defined(R123_USE_SSE4_2)
/* keep the caller's setting */
#elif defined(__SSE4_2__)
#define R123_USE_SSE4_2 1
#else
#define R123_USE_SSE4_2 0
#endif

#if defined(R123_USE_SSE4_1)
/* keep the caller's setting */
#elif defined(__SSE4_1__)
#define R123_USE_SSE4_1 1
#else
#define R123_USE_SSE4_1 0
#endif

/* SSE support in Random123 is only worth compiling when SSE2 is
   available, so the generic SSE switch follows __SSE2__. */
#if defined(R123_USE_SSE)
/* keep the caller's setting */
#elif defined(__SSE2__)
#define R123_USE_SSE 1
#else
#define R123_USE_SSE 0
#endif
/* OpenSSL cannot be detected reliably at compile time without a
   configure-like step, so it is assumed to be absent.  To experiment with
   it, build with -DR123_USE_AES_OPENSSL=1 and add whatever LDFLAGS or
   LDLIBS the OpenSSL installation needs. */
#if !defined(R123_USE_AES_OPENSSL)
#define R123_USE_AES_OPENSSL 0
#endif

/* __uint128_t is considered usable on x86-64 targets only. */
#if defined(R123_USE_GNU_UINT128)
/* keep the caller's setting */
#elif defined(__x86_64__)
#define R123_USE_GNU_UINT128 1
#else
#define R123_USE_GNU_UINT128 0
#endif
/* R123_USE_ASM_GNU: 1 on x86-64 / i386 targets, else 0.
   The architecture test is evaluated here with #if so the symbol is a
   plain 0 or 1.  Leaving defined(...) in the macro body would make every
   `#if R123_USE_ASM_GNU` rely on undefined behaviour (`defined` generated
   by macro expansion) and would make the symbol unusable in ordinary
   expressions. */
#ifndef R123_USE_ASM_GNU
#if defined(__x86_64__) || defined(__i386__)
#define R123_USE_ASM_GNU 1
#else
#define R123_USE_ASM_GNU 0
#endif
#endif
/* The MSVC cpuid intrinsic is never used with this compiler. */
#ifndef R123_USE_CPUID_MSVC
#define R123_USE_CPUID_MSVC 0
#endif
/* R123_USE_X86INTRIN_H: 1 on x86-64 / i386 targets when the gcc version is
   at least 4.4.2, else 0.  Evaluated with #if for the same reason as
   R123_USE_ASM_GNU above. */
#ifndef R123_USE_X86INTRIN_H
#if (defined(__x86_64__) || defined(__i386__)) && R123_GNUC_VERSION >= 40402
#define R123_USE_X86INTRIN_H 1
#else
#define R123_USE_X86INTRIN_H 0
#endif
#endif
/* Which intrinsic headers to include, and which mulhilo strategies to
   use.  Everything defaults to 0 except emmintrin.h / smmintrin.h, which
   are requested for gcc versions older than 4.4.2 when the matching SSE
   level is enabled.  A symbol that is already defined is left alone. */
#ifndef R123_USE_IA32INTRIN_H
#define R123_USE_IA32INTRIN_H 0
#endif
#ifndef R123_USE_XMMINTRIN_H
#define R123_USE_XMMINTRIN_H 0
#endif
#ifndef R123_USE_EMMINTRIN_H
/* gcc -m64 on Solaris 10 defines __SSE2__ but doesn't have
emmintrin.h in the include search path. This is
so broken that I refuse to try to work around it. If this
affects you, figure out where your emmintrin.h lives and
add an appropriate -I to your CPPFLAGS. Or add -DR123_USE_SSE=0. */
#define R123_USE_EMMINTRIN_H (R123_USE_SSE && (R123_GNUC_VERSION < 40402))
#endif
#ifndef R123_USE_SMMINTRIN_H
#define R123_USE_SMMINTRIN_H ((R123_USE_SSE4_1 || R123_USE_SSE4_2) && (R123_GNUC_VERSION < 40402))
#endif
#ifndef R123_USE_WMMINTRIN_H
#define R123_USE_WMMINTRIN_H 0
#endif
#ifndef R123_USE_INTRIN_H
#define R123_USE_INTRIN_H 0
#endif
/* Assembly and vendor-intrinsic multiplication paths are all off by
   default with this compiler. */
#ifndef R123_USE_MULHILO32_ASM
#define R123_USE_MULHILO32_ASM 0
#endif
#ifndef R123_USE_MULHILO64_ASM
#define R123_USE_MULHILO64_ASM 0
#endif
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
#define R123_USE_MULHILO64_MSVC_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
#define R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_MULHI_INTRIN
/* Defaults to 1 when __powerpc64__ is defined and to 0 otherwise.  The test
   is done in an #if so that the macro expands to a plain 0 or 1; a
   `defined` operator produced by macro expansion inside a later #if has
   undefined behavior. */
#if defined(__powerpc64__)
#define R123_USE_MULHILO64_MULHI_INTRIN 1
#else
#define R123_USE_MULHILO64_MULHI_INTRIN 0
#endif
#endif
#ifndef R123_MULHILO64_MULHI_INTRIN
#define R123_MULHILO64_MULHI_INTRIN __mulhdu
#endif
#ifndef R123_USE_MULHILO32_MULHI_INTRIN
#define R123_USE_MULHILO32_MULHI_INTRIN 0
#endif
#ifndef R123_MULHILO32_MULHI_INTRIN
#define R123_MULHILO32_MULHI_INTRIN __mulhwu
#endif
#ifndef __STDC_CONSTANT_MACROS
#define __STDC_CONSTANT_MACROS
#endif
#include <stdint.h>
#ifndef UINT64_C
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
#endif
/* If you add something, it must go in all the other XXfeatures.hpp
and in ../ut_features.cpp */
#endif
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __icpcfeatures_dot_hpp
#define __icpcfeatures_dot_hpp
// icc relies on gcc libraries and other toolchain components.
#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
#if !defined(__x86_64__) && !defined(__i386__)
# error "This code has only been tested on x86 platforms."
{ // maybe an unbalanced brace will terminate the compilation
// You are invited to try Random123 on other architectures, by changing
// the conditions that reach this error, but you should consider it a
// porting exercise and expect to encounter bugs and deficiencies.
// Please let the authors know of any successes (or failures).
#endif
#ifndef R123_STATIC_INLINE
#define R123_STATIC_INLINE static inline
#endif
#ifndef R123_FORCE_INLINE
#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
#endif
#ifndef R123_CUDA_DEVICE
#define R123_CUDA_DEVICE
#endif
#ifndef R123_ASSERT
#include <assert.h>
#define R123_ASSERT(x) assert(x)
#endif
#ifndef R123_BUILTIN_EXPECT
#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
#endif
// The basic idiom is:
// #ifndef R123_SOMETHING
// #if some condition
// #define R123_SOMETHING 1
// #else
// #define R123_SOMETHING 0
// #endif
// #endif
// This idiom allows an external user to override any decision
// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHING=0
// An alternative idiom is:
// #ifndef R123_SOMETHING
// #define R123_SOMETHING (some boolean expression)
// #endif
// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE
// pp-symbols.
#ifndef R123_USE_SSE4_2
#ifdef __SSE4_2__
#define R123_USE_SSE4_2 1
#else
#define R123_USE_SSE4_2 0
#endif
#endif
#ifndef R123_USE_SSE4_1
#ifdef __SSE4_1__
#define R123_USE_SSE4_1 1
#else
#define R123_USE_SSE4_1 0
#endif
#endif
#ifndef R123_USE_SSE
#ifdef __SSE2__
#define R123_USE_SSE 1
#else
#define R123_USE_SSE 0
#endif
#endif
#ifndef R123_USE_AES_NI
// Unlike gcc, icc (version 12) does not pre-define an __AES__
// pp-symbol when -maes or -xHost is on the command line.  This feels
// like a defect in icc (it defines __SSE4_2__ in analogous
// circumstances), but until Intel fixes it, we're better off erring
// on the side of caution and not generating instructions that are
// going to raise SIGILL when executed.  To get the AES-NI
// instructions with icc, the caller must put something like
// -DR123_USE_AES_NI=1 or -D__AES__ on the command line.  FWIW, the
// AES-NI Whitepaper by Gueron says that icc has supported AES-NI from
// 11.1 onwards.
//
// The condition is evaluated here, in an #if, so that the macro expands
// to a plain 0 or 1.  A `defined` operator that appears as the result of
// macro expansion inside a later #if has undefined behavior.
#if defined(__ICC) && (__ICC >= 1101) && defined(__AES__)
#define R123_USE_AES_NI 1
#else
#define R123_USE_AES_NI 0
#endif
#endif
#ifndef R123_USE_AES_OPENSSL
/* There isn't really a good way to tell at compile time whether
openssl is available. Without a pre-compilation configure-like
tool, it's less error-prone to guess that it isn't available. Add
-DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
play with openssl */
#define R123_USE_AES_OPENSSL 0
#endif
#ifndef R123_USE_GNU_UINT128
#define R123_USE_GNU_UINT128 0
#endif
#ifndef R123_USE_ASM_GNU
#define R123_USE_ASM_GNU 1
#endif
#ifndef R123_USE_CPUID_MSVC
#define R123_USE_CPUID_MSVC 0
#endif
#ifndef R123_USE_X86INTRIN_H
#define R123_USE_X86INTRIN_H 0
#endif
#ifndef R123_USE_IA32INTRIN_H
#define R123_USE_IA32INTRIN_H 1
#endif
#ifndef R123_USE_XMMINTRIN_H
#define R123_USE_XMMINTRIN_H 0
#endif
#ifndef R123_USE_EMMINTRIN_H
#define R123_USE_EMMINTRIN_H 1
#endif
#ifndef R123_USE_SMMINTRIN_H
#define R123_USE_SMMINTRIN_H 1
#endif
#ifndef R123_USE_WMMINTRIN_H
#define R123_USE_WMMINTRIN_H 1
#endif
#ifndef R123_USE_INTRIN_H
#define R123_USE_INTRIN_H 0
#endif
#ifndef R123_USE_MULHILO16_ASM
#define R123_USE_MULHILO16_ASM 0
#endif
#ifndef R123_USE_MULHILO32_ASM
#define R123_USE_MULHILO32_ASM 0
#endif
#ifndef R123_USE_MULHILO64_ASM
#define R123_USE_MULHILO64_ASM 1
#endif
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
#define R123_USE_MULHILO64_MSVC_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
#define R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
#endif
#ifndef __STDC_CONSTANT_MACROS
#define __STDC_CONSTANT_MACROS
#endif
#include <stdint.h>
#ifndef UINT64_C
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
#endif
// If you add something, it must go in all the other XXfeatures.hpp
// and in ../ut_features.cpp
#endif
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __msvcfeatures_dot_hpp
#define __msvcfeatures_dot_hpp
//#if _MSVC_FULL_VER <= 15
//#error "We've only tested MSVC_FULL_VER==15."
//#endif
#if !defined(_M_IX86) && !defined(_M_X64)
# error "This code has only been tested on x86 platforms."
{ // maybe an unbalanced brace will terminate the compilation
// You are invited to try Random123 on other architectures, by changing
// the conditions that reach this error, but you should consider it a
// porting exercise and expect to encounter bugs and deficiencies.
// Please let the authors know of any successes (or failures).
#endif
#ifndef R123_STATIC_INLINE
#define R123_STATIC_INLINE static __inline
#endif
#ifndef R123_FORCE_INLINE
#define R123_FORCE_INLINE(decl) _forceinline decl
#endif
#ifndef R123_CUDA_DEVICE
#define R123_CUDA_DEVICE
#endif
#ifndef R123_ASSERT
#include <assert.h>
#define R123_ASSERT(x) assert(x)
#endif
#ifndef R123_BUILTIN_EXPECT
#define R123_BUILTIN_EXPECT(expr,likely) expr
#endif
// The basic idiom is:
// #ifndef R123_SOMETHING
// #if some condition
// #define R123_SOMETHING 1
// #else
// #define R123_SOMETHING 0
// #endif
// #endif
// This idiom allows an external user to override any decision
// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHING=0
// An alternative idiom is:
// #ifndef R123_SOMETHING
// #define R123_SOMETHING (some boolean expression)
// #endif
// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE
// pp-symbols.
#ifndef R123_USE_AES_NI
#if defined(_M_X64)
#define R123_USE_AES_NI 1
#else
#define R123_USE_AES_NI 0
#endif
#endif
#ifndef R123_USE_SSE4_2
#if defined(_M_X64)
#define R123_USE_SSE4_2 1
#else
#define R123_USE_SSE4_2 0
#endif
#endif
#ifndef R123_USE_SSE4_1
#if defined(_M_X64)
#define R123_USE_SSE4_1 1
#else
#define R123_USE_SSE4_1 0
#endif
#endif
#ifndef R123_USE_SSE
#define R123_USE_SSE 1
#endif
#ifndef R123_USE_AES_OPENSSL
#define R123_USE_AES_OPENSSL 0
#endif
#ifndef R123_USE_GNU_UINT128
#define R123_USE_GNU_UINT128 0
#endif
#ifndef R123_USE_ASM_GNU
#define R123_USE_ASM_GNU 0
#endif
#ifndef R123_USE_CPUID_MSVC
#define R123_USE_CPUID_MSVC 1
#endif
#ifndef R123_USE_X86INTRIN_H
#define R123_USE_X86INTRIN_H 0
#endif
#ifndef R123_USE_IA32INTRIN_H
#define R123_USE_IA32INTRIN_H 0
#endif
#ifndef R123_USE_XMMINTRIN_H
#define R123_USE_XMMINTRIN_H 0
#endif
#ifndef R123_USE_EMMINTRIN_H
#define R123_USE_EMMINTRIN_H 1
#endif
#ifndef R123_USE_SMMINTRIN_H
#define R123_USE_SMMINTRIN_H 1
#endif
#ifndef R123_USE_WMMINTRIN_H
#define R123_USE_WMMINTRIN_H 1
#endif
#ifndef R123_USE_INTRIN_H
#define R123_USE_INTRIN_H 1
#endif
#ifndef R123_USE_MULHILO16_ASM
#define R123_USE_MULHILO16_ASM 0
#endif
#ifndef R123_USE_MULHILO32_ASM
#define R123_USE_MULHILO32_ASM 0
#endif
#ifndef R123_USE_MULHILO64_ASM
#define R123_USE_MULHILO64_ASM 0
#endif
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
#if defined(_M_X64)
#define R123_USE_MULHILO64_MSVC_INTRIN 1
#else
#define R123_USE_MULHILO64_MSVC_INTRIN 0
#endif
#endif
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
#define R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
#endif
#ifndef __STDC_CONSTANT_MACROS
#define __STDC_CONSTANT_MACROS
#endif
#include <stdint.h>
#ifndef UINT64_C
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
#endif
#pragma warning(disable:4244)
#pragma warning(disable:4996)
// If you add something, it must go in all the other XXfeatures.hpp
// and in ../ut_features.cpp
#endif
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __r123_nvcc_features_dot_h__
#define __r123_nvcc_features_dot_h__
#if !defined(CUDART_VERSION)
#error "why are we in nvccfeatures.h if CUDART_VERSION is not defined"
#endif
#if CUDART_VERSION < 4010
#error "CUDA versions earlier than 4.1 produce incorrect results for some templated functions in namespaces. Random123 isunsupported. See comments in nvccfeatures.h"
// This test was added in Random123-1.08 (August, 2013) because we
// discovered that Ftype(maxTvalue<T>()) with Ftype=double and
// T=uint64_t in examples/uniform.hpp produces -1 for CUDA4.0 and
// earlier. We can't be sure this bug doesn't also affect invocations
// of other templated functions, e.g., essentially all of Random123.
// Thus, we no longer trust CUDA versions earlier than 4.1 even though
// we had previously tested and timed Random123 with CUDA 3.x and 4.0.
// If you feel lucky or desperate, you can change #error to #warning, but
// please take extra care to be sure that you are getting correct
// results.
#endif
// nvcc falls through to gcc or msvc. So first define
// a couple of things and then include either gccfeatures.h
// or msvcfeatures.h
//#ifdef __CUDA_ARCH__ allows Philox32 and Philox64 to be compiled
//for both device and host functions in CUDA by setting compiler flags
//for the device function
#ifdef __CUDA_ARCH__
#ifndef R123_CUDA_DEVICE
#define R123_CUDA_DEVICE __device__
#endif
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
#define R123_USE_MULHILO64_CUDA_INTRIN 1
#endif
#ifndef R123_THROW
// No exceptions in CUDA, at least up to 4.0
#define R123_THROW(x) R123_ASSERT(0)
#endif
#ifndef R123_ASSERT
#define R123_ASSERT(x) if((x)) ; else asm("trap;")
#endif
#else // ! __CUDA_ARCH__
// If we're using nvcc not compiling for the CUDA architecture,
// then we must be compiling for the host. In that case,
// tell the philox code to use the mulhilo64 asm because
// nvcc doesn't grok uint128_t.
#ifndef R123_USE_MULHILO64_ASM
#define R123_USE_MULHILO64_ASM 1
#endif
#endif // __CUDA_ARCH__
#ifndef R123_BUILTIN_EXPECT
#define R123_BUILTIN_EXPECT(expr,likely) expr
#endif
#ifndef R123_USE_AES_NI
#define R123_USE_AES_NI 0
#endif
#ifndef R123_USE_SSE4_2
#define R123_USE_SSE4_2 0
#endif
#ifndef R123_USE_SSE4_1
#define R123_USE_SSE4_1 0
#endif
#ifndef R123_USE_SSE
#define R123_USE_SSE 0
#endif
#ifndef R123_USE_GNU_UINT128
#define R123_USE_GNU_UINT128 0
#endif
#ifndef R123_ULONG_LONG
// uint64_t, which is what we'd get without this, is
// not the same as unsigned long long
#define R123_ULONG_LONG unsigned long long
#endif
#if defined(__GNUC__)
#include "gccfeatures.h"
#elif defined(_MSC_FULL_VER)
#include "msvcfeatures.h"
#endif
#endif
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __open64features_dot_hpp
#define __open64features_dot_hpp
/* The gcc features are mostly right. We just override a few and then include gccfeatures.h */
/* Open64 4.2.3 and 4.2.4 accept the __uint128_t code without complaint
but produce incorrect code for 64-bit philox. The MULHILO64_ASM
seems to work fine */
#ifndef R123_USE_GNU_UINT128
#define R123_USE_GNU_UINT128 0
#endif
#ifndef R123_USE_MULHILO64_ASM
#define R123_USE_MULHILO64_ASM 1
#endif
#include "gccfeatures.h"
#endif
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __openclfeatures_dot_hpp
#define __openclfeatures_dot_hpp
#ifndef R123_STATIC_INLINE
#define R123_STATIC_INLINE inline
#endif
#ifndef R123_FORCE_INLINE
#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
#endif
#ifndef R123_CUDA_DEVICE
#define R123_CUDA_DEVICE
#endif
#ifndef R123_ASSERT
#define R123_ASSERT(x)
#endif
#ifndef R123_BUILTIN_EXPECT
#define R123_BUILTIN_EXPECT(expr,likely) expr
#endif
#ifndef R123_USE_GNU_UINT128
#define R123_USE_GNU_UINT128 0
#endif
#ifndef R123_USE_MULHILO64_ASM
#define R123_USE_MULHILO64_ASM 0
#endif
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
#define R123_USE_MULHILO64_MSVC_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
#define R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
#define R123_USE_MULHILO64_OPENCL_INTRIN 1
#endif
#ifndef R123_USE_AES_NI
#define R123_USE_AES_NI 0
#endif
// XXX ATI APP SDK 2.4 clBuildProgram SEGVs if one uses uint64_t instead of
// ulong to mul_hi. And gets lots of complaints from stdint.h
// on some machines.
// But these typedefs mean we cannot include stdint.h with
// these headers? Do we need R123_64T, R123_32T, R123_8T?
typedef ulong uint64_t;
typedef uint uint32_t;
typedef uchar uint8_t;
#define UINT64_C(x) ((ulong)(x##UL))
#endif
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Copyright (c) 2013, Los Alamos National Security, LLC
All rights reserved.
Copyright 2013. Los Alamos National Security, LLC. This software was produced
under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
Laboratory (LANL), which is operated by Los Alamos National Security, LLC for
the U.S. Department of Energy. The U.S. Government has rights to use,
reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
to produce derivative works, such modified software should be clearly marked,
so as not to confuse it with the version available from LANL.
*/
#ifndef __pgccfeatures_dot_hpp
#define __pgccfeatures_dot_hpp
#if !defined(__x86_64__) && !defined(__i386__)
# error "This code has only been tested on x86 platforms."
#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
{ /* maybe an unbalanced brace will terminate the compilation */
/* Feel free to try the Random123 library on other architectures by changing
the conditions that reach this error, but you should consider it a
porting exercise and expect to encounter bugs and deficiencies.
Please let the authors know of any successes (or failures). */
#endif
#ifndef R123_STATIC_INLINE
#define R123_STATIC_INLINE static inline
#endif
/* Found this example in PGI's emmintrin.h. */
#ifndef R123_FORCE_INLINE
#define R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__))
#endif
#ifndef R123_CUDA_DEVICE
#define R123_CUDA_DEVICE
#endif
#ifndef R123_ASSERT
#include <assert.h>
#define R123_ASSERT(x) assert(x)
#endif
#ifndef R123_BUILTIN_EXPECT
#define R123_BUILTIN_EXPECT(expr,likely) (expr)
#endif
/* PGI through 13.2 doesn't appear to support AES-NI. */
#ifndef R123_USE_AES_NI
#define R123_USE_AES_NI 0
#endif
/* PGI through 13.2 appears to support MMX, SSE, SSE2, SSE3, SSSE3, SSE4a, and
ABM, but not SSE4.1 or SSE4.2. */
#ifndef R123_USE_SSE4_2
#define R123_USE_SSE4_2 0
#endif
#ifndef R123_USE_SSE4_1
#define R123_USE_SSE4_1 0
#endif
#ifndef R123_USE_SSE
/* There's no point in trying to compile SSE code in Random123
unless SSE2 is available. */
#ifdef __SSE2__
#define R123_USE_SSE 1
#else
#define R123_USE_SSE 0
#endif
#endif
#ifndef R123_USE_AES_OPENSSL
/* There isn't really a good way to tell at compile time whether
openssl is available. Without a pre-compilation configure-like
tool, it's less error-prone to guess that it isn't available. Add
-DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
play with openssl */
#define R123_USE_AES_OPENSSL 0
#endif
#ifndef R123_USE_GNU_UINT128
#define R123_USE_GNU_UINT128 0
#endif
#ifndef R123_USE_ASM_GNU
#define R123_USE_ASM_GNU 1
#endif
#ifndef R123_USE_CPUID_MSVC
#define R123_USE_CPUID_MSVC 0
#endif
#ifndef R123_USE_X86INTRIN_H
#define R123_USE_X86INTRIN_H 0
#endif
#ifndef R123_USE_IA32INTRIN_H
#define R123_USE_IA32INTRIN_H 0
#endif
/* emmintrin.h from PGI #includes xmmintrin.h but then complains at link time
about undefined references to _mm_castsi128_ps(__m128i). Why? */
#ifndef R123_USE_XMMINTRIN_H
#define R123_USE_XMMINTRIN_H 1
#endif
#ifndef R123_USE_EMMINTRIN_H
#define R123_USE_EMMINTRIN_H 1
#endif
#ifndef R123_USE_SMMINTRIN_H
#define R123_USE_SMMINTRIN_H 0
#endif
#ifndef R123_USE_WMMINTRIN_H
#define R123_USE_WMMINTRIN_H 0
#endif
#ifndef R123_USE_INTRIN_H
#ifdef __ABM__
#define R123_USE_INTRIN_H 1
#else
#define R123_USE_INTRIN_H 0
#endif
#endif
#ifndef R123_USE_MULHILO32_ASM
#define R123_USE_MULHILO32_ASM 0
#endif
#ifndef R123_USE_MULHILO64_MULHI_INTRIN
#define R123_USE_MULHILO64_MULHI_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_ASM
#define R123_USE_MULHILO64_ASM 1
#endif
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
#define R123_USE_MULHILO64_MSVC_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
#define R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
#endif
#ifndef __STDC_CONSTANT_MACROS
#define __STDC_CONSTANT_MACROS
#endif
#include <stdint.h>
#ifndef UINT64_C
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
#endif
/* If you add something, it must go in all the other XXfeatures.hpp
and in ../ut_features.cpp */
#endif
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _Random123_sse_dot_h__
#define _Random123_sse_dot_h__
//#ifndef R123_USE_ASM_GNU
//#define R123_USE_ASM_GNU (defined(__x86_64__)||defined(__i386__))
//#endif
//#ifndef R123_USE_X86INTRIN_H
//#define R123_USE_X86INTRIN_H ((defined(__x86_64__)||defined(__i386__)) && R123_GNUC_VERSION >= 40402)
//#endif
#if R123_USE_SSE
//#if R123_USE_X86INTRIN_H
//#include <x86intrin.h>
//#endif
#if R123_USE_IA32INTRIN_H
#include <ia32intrin.h>
#endif
#if R123_USE_XMMINTRIN_H
#include <xmmintrin.h>
#endif
#if R123_USE_EMMINTRIN_H
#include <emmintrin.h>
#endif
#if R123_USE_SMMINTRIN_H
#include <smmintrin.h>
#endif
#if R123_USE_WMMINTRIN_H
#include <wmmintrin.h>
#endif
#if R123_USE_INTRIN_H
#include <intrin.h>
#endif
#ifdef __cplusplus
#include <iostream>
#include <limits>
#include <stdexcept>
#endif
// There is a lot of annoying and inexplicable variation in the
// SSE intrinsics available in different compilation environments.
// The details seem to depend on the compiler, the version and
// the target architecture. Rather than insisting on
// R123_USE_feature tests for each of these in each of the
// compilerfeatures.h files we just keep the complexity localized
// to here...
#if (defined(__ICC) && __ICC<1210) || (defined(_MSC_VER) && !defined(_WIN64))
/* Fallback for toolchains that lack an intrinsic to assemble an __m128i
   from two 64-bit words (it looks like Intel added _mm_set_epi64x to icc
   version 12.1 in Jan 2012).  The value is built with the 4x32-bit
   intrinsic instead: v1 supplies the upper 64 bits and v0 the lower 64
   bits of the result. */
R123_STATIC_INLINE __m128i _mm_set_epi64x(uint64_t v1, uint64_t v0){
    /* Split each 64-bit word into its high and low 32-bit halves. */
    const uint32_t hiWordHi = (uint32_t)(v1 >> 32);
    const uint32_t hiWordLo = (uint32_t)v1;
    const uint32_t loWordHi = (uint32_t)(v0 >> 32);
    const uint32_t loWordLo = (uint32_t)v0;
    /* _mm_set_epi32 takes its arguments from most- to least-significant. */
    return _mm_set_epi32(hiWordHi, hiWordLo, loWordHi, loWordLo);
}
#endif
/* _mm_extract_lo64 abstracts the task of extracting the low 64-bit
word from an __m128i. The _mm_cvtsi128_si64 intrinsic does the job
on 64-bit platforms. Unfortunately, both MSVC and Open64 fail
assertions in ut_M128.cpp and ut_carray.cpp when we use the
_mm_cvtsi128_si64 intrinsic. (See
https://bugs.open64.net/show_bug.cgi?id=873 for the Open64 bug).
On 32-bit platforms, there's no MOVQ, so there's no intrinsic.
Finally, even if the intrinsic exists, it may be spelled with or
without the 'x'.
*/
/* Return the low 64-bit word of si.  Which implementation is compiled
   depends on the target and compiler; see the guards inside the body. */
R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
#if !defined(__x86_64__) || defined(_MSC_VER) || defined(__OPEN64__)
    /* No usable intrinsic here: spill the register to memory and read
       back element 0. */
    union{
        __m128i vec;
        uint64_t words[2];
    } spill;
    _mm_store_si128(&spill.vec, si);
    return spill.words[0];
#elif defined(__llvm__) || defined(__ICC)
    /* These compilers use the spelling without the trailing 'x'. */
    return (uint64_t)_mm_cvtsi128_si64(si);
#else /* GNUC, others */
    /* FWIW, gcc's emmintrin.h has had the 'x' spelling
       since at least gcc-3.4.4.  The no-'x' spelling showed up
       around 4.2. */
    return (uint64_t)_mm_cvtsi128_si64x(si);
#endif
}
#if defined(__GNUC__) && __GNUC__ < 4
/* the cast builtins showed up in gcc4. */
/* For GNU compilers older than version 4, define _mm_castsi128_ps here:
   it returns its __m128i argument converted to __m128 with a plain
   vector cast. */
R123_STATIC_INLINE __m128 _mm_castsi128_ps(__m128i si){
return (__m128)si;
}
#endif
#ifdef __cplusplus
// r123m128i wraps a single __m128i (the public member m) and adds the
// assignment and conversion operators declared below.
struct r123m128i{
__m128i m;
#if R123_USE_CXX11_UNRESTRICTED_UNIONS
// C++98 forbids a union member from having *any* constructors.
// C++11 relaxes this, and allows union members to have constructors
// as long as there is a "trivial" default constructor. So in C++11
// we can provide a r123m128i constructor with an __m128i argument, and still
// have the default (and hence trivial) default constructor.
r123m128i() = default;
r123m128i(__m128i _m): m(_m){}
#endif
// Assign from a raw __m128i.
r123m128i& operator=(const __m128i& rhs){ m=rhs; return *this;}
// Assign from an integer: n is placed in the low 64 bits, the high 64 bits are zero.
r123m128i& operator=(R123_ULONG_LONG n){ m = _mm_set_epi64x(0, n); return *this;}
#if R123_USE_CXX11_EXPLICIT_CONVERSIONS
// With C++0x we can attach explicit to the bool conversion operator
// to disambiguate undesired promotions. For g++, this works
// only in 4.5 and above.
explicit operator bool() const {return _bool();}
#else
// Pre-C++0x, we have to do something else. Google for the "safe bool"
// idiom for other ideas...
// Yields this (non-null) when _bool() is true and a null pointer otherwise.
operator const void*() const{return _bool()?this:0;}
#endif
// Implicit conversion back to the wrapped __m128i.
operator __m128i() const {return m;}
private:
// _bool(): true iff at least one bit of m is set.
#if R123_USE_SSE4_1
bool _bool() const{ return !_mm_testz_si128(m,m); }
#else
// Comparing with zero sets one mask bit per all-zero 32-bit lane, so a mask
// of 0xf means every lane is zero.
bool _bool() const{ return 0xf != _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(m, _mm_setzero_si128()))); }
#endif
};
// Pre-increment: add one to v, treating v.m as a single 128-bit integer.
// _mm_add_epi64 adds the two 64-bit lanes independently, so the carry from the
// low lane into the high lane is applied by hand when the low lane wraps to 0.
R123_STATIC_INLINE r123m128i& operator++(r123m128i& v){
    // Bump the low 64-bit lane only.
    v.m = _mm_add_epi64(v.m, _mm_set_epi64x(R123_64BIT(0), R123_64BIT(1)));
#if R123_USE_SSE4_1
    // testz yields 1 iff (v.m & low-lane mask) is all zero.
    const bool low_wrapped = _mm_testz_si128(v.m, _mm_set_epi64x(0, ~(R123_64BIT(0)))) != 0;
#else
    const unsigned zero_lanes = _mm_movemask_ps( _mm_castsi128_ps(_mm_cmpeq_epi32(v.m, _mm_setzero_si128())));
    // The low two bits of zero_lanes are 11 iff the low 64 bits of
    // v.m are zero.
    const bool low_wrapped = (zero_lanes&0x3) == 0x3;
#endif
    if( R123_BUILTIN_EXPECT(low_wrapped, 0) ){
        // Propagate the carry into the high 64-bit lane.
        v.m = _mm_add_epi64(v.m, _mm_set_epi64x(R123_64BIT(1), R123_64BIT(0)));
    }
    return v;
}
// Add the unsigned integer n to lhs, treating lhs.m as one 128-bit integer,
// and return lhs.
// _mm_add_epi64 adds the two 64-bit lanes independently, so there is no carry
// between them; the carry out of the low lane is detected by hand (the
// unsigned sum wrapped iff it ended up smaller than n) and added to the high
// lane.
R123_STATIC_INLINE r123m128i& operator+=(r123m128i& lhs, R123_ULONG_LONG n){
    __m128i sum = _mm_add_epi64(lhs.m, _mm_set_epi64x(0, n));
    // Keep the low lane unsigned throughout: storing it in a signed 64-bit
    // variable first would be an implementation-defined conversion for
    // values above INT64_MAX.
    const uint64_t lo64 = _mm_extract_lo64(sum);
    if(lo64 < n)
        sum = _mm_add_epi64(sum, _mm_set_epi64x(1,0));
    lhs.m = sum;
    return lhs;
}
// Placeholder ordering test between an integer and an r123m128i.  Per the
// original note it has to exist because r123array1xm128i::incr mentions it,
// although it is never used there.  Calling it always throws.
R123_STATIC_INLINE bool operator<=(R123_ULONG_LONG, const r123m128i &)
{
    throw std::runtime_error("operator<=(unsigned long long, r123m128i) is unimplemented.");
}
// Ordering comparisons between two r123m128i values are not implemented.
// They are still declared because, without them, an expression such as
// M1 < M2 would compile through the implicit conversion to void*.  Sigh...
// Calling this always throws.
R123_STATIC_INLINE bool operator<(const r123m128i&, const r123m128i&)
{
    throw std::runtime_error("operator<(r123m128i, r123m128i) is unimplemented.");
}
// Unimplemented ordering comparison; calling it always throws.
R123_STATIC_INLINE bool operator<=(const r123m128i&, const r123m128i&)
{
    throw std::runtime_error("operator<=(r123m128i, r123m128i) is unimplemented.");
}
// Unimplemented ordering comparison; calling it always throws.
R123_STATIC_INLINE bool operator>(const r123m128i&, const r123m128i&)
{
    throw std::runtime_error("operator>(r123m128i, r123m128i) is unimplemented.");
}
// Unimplemented ordering comparison; calling it always throws.
R123_STATIC_INLINE bool operator>=(const r123m128i&, const r123m128i&)
{
    throw std::runtime_error("operator>=(r123m128i, r123m128i) is unimplemented.");
}
// Equality of two r123m128i values: true iff all four 32-bit lanes match.
R123_STATIC_INLINE bool operator==(const r123m128i &lhs, const r123m128i &rhs){
    // One all-ones lane per matching 32-bit lane ...
    const __m128i lane_eq = _mm_cmpeq_epi32(lhs, rhs);
    // ... collapsed to one bit per lane; 0xf means all four lanes matched.
    const int lane_bits = _mm_movemask_ps(_mm_castsi128_ps(lane_eq));
    return lane_bits == 0xf;
}
// Inequality of two r123m128i values, defined as the negation of operator==.
R123_STATIC_INLINE bool operator!=(const r123m128i &lhs, const r123m128i &rhs){
    const bool same = (lhs == rhs);
    return !same;
}
// Compare an integer with an r123m128i: the integer is placed in the low
// 64 bits of a temporary (high 64 bits zero) and the two are compared.
R123_STATIC_INLINE bool operator==(R123_ULONG_LONG lhs, const r123m128i &rhs){
    r123m128i widened;
    widened.m = _mm_set_epi64x(0, lhs);
    return widened == rhs;
}
// Integer-vs-r123m128i inequality, defined as the negation of operator==.
R123_STATIC_INLINE bool operator!=(R123_ULONG_LONG lhs, const r123m128i &rhs){
    const bool same = (lhs == rhs);
    return !same;
}
// Write an r123m128i as two unsigned 64-bit numbers separated by one space:
// 64-bit element 0 of the stored register first, then element 1.
R123_STATIC_INLINE std::ostream& operator<<(std::ostream& os, const r123m128i& m){
    union{
        uint64_t word[2];
        __m128i vec;
    }parts;
    _mm_storeu_si128(&parts.vec, m.m);
    os << parts.word[0];
    os << " ";
    os << parts.word[1];
    return os;
}
// Read an r123m128i as two unsigned 64-bit numbers: the first becomes the low
// 64 bits and the second the high 64 bits.
// If either extraction fails, m is left untouched and the stream is returned
// in its failed state, so no indeterminate value is ever stored into m.
R123_STATIC_INLINE std::istream& operator>>(std::istream& is, r123m128i& m){
    uint64_t u64[2] = {0, 0};
    if(is >> u64[0] >> u64[1])
        m.m = _mm_set_epi64x(u64[1], u64[0]);
    return is;
}
template<typename T> inline T assemble_from_u32(uint32_t *p32); // forward declaration
// Specialization for r123m128i: packs p32[0..3] into one register, with
// p32[0] passed as the last (lowest) argument of _mm_set_epi32 and p32[3]
// as the first (highest).
template <>
inline r123m128i assemble_from_u32<r123m128i>(uint32_t *p32){
    const uint32_t w0 = p32[0];
    const uint32_t w1 = p32[1];
    const uint32_t w2 = p32[2];
    const uint32_t w3 = p32[3];
    r123m128i packed;
    packed.m = _mm_set_epi32(w3, w2, w1, w0);
    return packed;
}
#else
/* Plain-C form of r123m128i: a struct whose only member is the __m128i m. */
typedef struct {
    __m128i m;
} r123m128i;
#endif /* __cplusplus */
#else /* !R123_USE_SSE */
/* Fallback compiled when R123_USE_SSE is off: always reports 0. */
R123_STATIC_INLINE int haveAESNI()
{
    return 0;
}
#endif /* R123_USE_SSE */
#endif /* _Random123_sse_dot_h__ */
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __sunprofeatures_dot_hpp
#define __sunprofeatures_dot_hpp
#ifndef R123_STATIC_INLINE
#define R123_STATIC_INLINE static inline
#endif
#ifndef R123_FORCE_INLINE
#define R123_FORCE_INLINE(decl) decl
#endif
#ifndef R123_CUDA_DEVICE
#define R123_CUDA_DEVICE
#endif
#ifndef R123_ASSERT
#include <assert.h>
#define R123_ASSERT(x) assert(x)
#endif
#ifndef R123_BUILTIN_EXPECT
#define R123_BUILTIN_EXPECT(expr,likely) expr
#endif
// The basic idiom is:
// #ifndef R123_SOMETHING
// #if some condition
// #define R123_SOMETHING 1
// #else
// #define R123_SOMETHING 0
// #endif
// #endif
// This idiom allows an external user to override any decision
// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHING=0
// An alternative idiom is:
// #ifndef R123_SOMETHING
// #define R123_SOMETHING (some boolean expression)
// #endif
// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE
// pp-symbols.
#ifndef R123_USE_AES_NI
#define R123_USE_AES_NI 0
#endif
#ifndef R123_USE_SSE4_2
#define R123_USE_SSE4_2 0
#endif
#ifndef R123_USE_SSE4_1
#define R123_USE_SSE4_1 0
#endif
#ifndef R123_USE_SSE
#define R123_USE_SSE 0
#endif
#ifndef R123_USE_AES_OPENSSL
#define R123_USE_AES_OPENSSL 0
#endif
#ifndef R123_USE_GNU_UINT128
#define R123_USE_GNU_UINT128 0
#endif
#ifndef R123_USE_ASM_GNU
#define R123_USE_ASM_GNU 0
#endif
#ifndef R123_USE_CPUID_MSVC
#define R123_USE_CPUID_MSVC 0
#endif
#ifndef R123_USE_X86INTRIN_H
#define R123_USE_X86INTRIN_H 0
#endif
#ifndef R123_USE_IA32INTRIN_H
#define R123_USE_IA32INTRIN_H 0
#endif
#ifndef R123_USE_XMMINTRIN_H
#define R123_USE_XMMINTRIN_H 0
#endif
#ifndef R123_USE_EMMINTRIN_H
#define R123_USE_EMMINTRIN_H 0
#endif
#ifndef R123_USE_SMMINTRIN_H
#define R123_USE_SMMINTRIN_H 0
#endif
#ifndef R123_USE_WMMINTRIN_H
#define R123_USE_WMMINTRIN_H 0
#endif
#ifndef R123_USE_INTRIN_H
#define R123_USE_INTRIN_H 0
#endif
#ifndef R123_USE_MULHILO16_ASM
#define R123_USE_MULHILO16_ASM 0
#endif
#ifndef R123_USE_MULHILO32_ASM
#define R123_USE_MULHILO32_ASM 0
#endif
#ifndef R123_USE_MULHILO64_ASM
#define R123_USE_MULHILO64_ASM 0
#endif
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
#define R123_USE_MULHILO64_MSVC_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
#define R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
#endif
#ifndef R123_USE_PHILOX_64BIT
#define R123_USE_PHILOX_64BIT 0
#endif
#ifndef __STDC_CONSTANT_MACROS
#define __STDC_CONSTANT_MACROS
#endif
#include <stdint.h>
#ifndef UINT64_C
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
#endif
// If you add something, it must go in all the other XXfeatures.hpp
// and in ../ut_features.cpp
#endif
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Copyright (c) 2013, Los Alamos National Security, LLC
All rights reserved.
Copyright 2013. Los Alamos National Security, LLC. This software was produced
under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
Laboratory (LANL), which is operated by Los Alamos National Security, LLC for
the U.S. Department of Energy. The U.S. Government has rights to use,
reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
to produce derivative works, such modified software should be clearly marked,
so as not to confuse it with the version available from LANL.
*/
#ifndef __xlcfeatures_dot_hpp
#define __xlcfeatures_dot_hpp
#if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__)
# error "This code has only been tested on x86 and PowerPC platforms."
#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
{ /* maybe an unbalanced brace will terminate the compilation */
/* Feel free to try the Random123 library on other architectures by changing
the conditions that reach this error, but you should consider it a
porting exercise and expect to encounter bugs and deficiencies.
Please let the authors know of any successes (or failures). */
#endif
#ifdef __cplusplus
/* Builtins are automatically available to xlc. To use them with xlc++,
one must include builtins.h; cf.
http://publib.boulder.ibm.com/infocenter/cellcomp/v101v121/index.jsp?topic=/com.ibm.xlcpp101.cell.doc/compiler_ref/compiler_builtins.html
*/
#include <builtins.h>
#endif
#ifndef R123_STATIC_INLINE
#define R123_STATIC_INLINE static inline
#endif
#ifndef R123_FORCE_INLINE
#define R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__))
#endif
#ifndef R123_CUDA_DEVICE
#define R123_CUDA_DEVICE
#endif
#ifndef R123_ASSERT
#include <assert.h>
#define R123_ASSERT(x) assert(x)
#endif
#ifndef R123_BUILTIN_EXPECT
#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
#endif
#ifndef R123_USE_AES_NI
#define R123_USE_AES_NI 0
#endif
#ifndef R123_USE_SSE4_2
#define R123_USE_SSE4_2 0
#endif
#ifndef R123_USE_SSE4_1
#define R123_USE_SSE4_1 0
#endif
#ifndef R123_USE_SSE
#define R123_USE_SSE 0
#endif
#ifndef R123_USE_AES_OPENSSL
/* There isn't really a good way to tell at compile time whether
openssl is available. Without a pre-compilation configure-like
tool, it's less error-prone to guess that it isn't available. Add
-DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
play with openssl */
#define R123_USE_AES_OPENSSL 0
#endif
#ifndef R123_USE_GNU_UINT128
#define R123_USE_GNU_UINT128 0
#endif
#ifndef R123_USE_ASM_GNU
#define R123_USE_ASM_GNU 1
#endif
#ifndef R123_USE_CPUID_MSVC
#define R123_USE_CPUID_MSVC 0
#endif
#ifndef R123_USE_X86INTRIN_H
#define R123_USE_X86INTRIN_H 0
#endif
#ifndef R123_USE_IA32INTRIN_H
#define R123_USE_IA32INTRIN_H 0
#endif
#ifndef R123_USE_XMMINTRIN_H
#define R123_USE_XMMINTRIN_H 0
#endif
#ifndef R123_USE_EMMINTRIN_H
#define R123_USE_EMMINTRIN_H 0
#endif
#ifndef R123_USE_SMMINTRIN_H
#define R123_USE_SMMINTRIN_H 0
#endif
#ifndef R123_USE_WMMINTRIN_H
#define R123_USE_WMMINTRIN_H 0
#endif
#ifndef R123_USE_INTRIN_H
#ifdef __ABM__
#define R123_USE_INTRIN_H 1
#else
#define R123_USE_INTRIN_H 0
#endif
#endif
#ifndef R123_USE_MULHILO32_ASM
#define R123_USE_MULHILO32_ASM 0
#endif
#ifndef R123_USE_MULHILO64_MULHI_INTRIN
/* Resolve the platform test here, where `defined` is evaluated directly by
   #if, and define the flag as a plain 0 or 1.  A macro whose replacement text
   contains `defined` has undefined behaviour when it is later expanded inside
   an #if, and is not a valid expression anywhere else. */
#if defined(__powerpc64__)
#define R123_USE_MULHILO64_MULHI_INTRIN 1
#else
#define R123_USE_MULHILO64_MULHI_INTRIN 0
#endif
#endif
#ifndef R123_MULHILO64_MULHI_INTRIN
#define R123_MULHILO64_MULHI_INTRIN __mulhdu
#endif
#ifndef R123_USE_MULHILO32_MULHI_INTRIN
#define R123_USE_MULHILO32_MULHI_INTRIN 0
#endif
#ifndef R123_MULHILO32_MULHI_INTRIN
#define R123_MULHILO32_MULHI_INTRIN __mulhwu
#endif
#ifndef R123_USE_MULHILO64_ASM
/* Enabled on 64-bit PowerPC only when the mulhi intrinsic is not in use.
   The test is resolved here and the flag defined as a plain 0 or 1: a macro
   whose replacement text contains `defined` has undefined behaviour when it
   is later expanded inside an #if. */
#if defined(__powerpc64__) && !(R123_USE_MULHILO64_MULHI_INTRIN)
#define R123_USE_MULHILO64_ASM 1
#else
#define R123_USE_MULHILO64_ASM 0
#endif
#endif
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
#define R123_USE_MULHILO64_MSVC_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
#define R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
#endif
#ifndef __STDC_CONSTANT_MACROS
#define __STDC_CONSTANT_MACROS
#endif
#include <stdint.h>
#ifndef UINT64_C
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
#endif
/* If you add something, it must go in all the other XXfeatures.hpp
and in ../ut_features.cpp */
#endif
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _threefry_dot_h_
#define _threefry_dot_h_
#include "features/compilerfeatures.h"
#include "array.h"
/** \cond HIDDEN_FROM_DOXYGEN */
/* Significant parts of this file were copied
from:
Skein_FinalRnd/ReferenceImplementation/skein.h
Skein_FinalRnd/ReferenceImplementation/skein_block.c
in http://csrc.nist.gov/groups/ST/hash/sha-3/Round3/documents/Skein_FinalRnd.zip
This file has been modified so that it may no longer perform its originally
intended function. If you're looking for a Skein or Threefish source code,
please consult the original file.
The original file had the following header:
**************************************************************************
**
** Interface declarations and internal definitions for Skein hashing.
**
** Source code author: Doug Whiting, 2008.
**
** This algorithm and source code is released to the public domain.
**
***************************************************************************
*/
/* See comment at the top of philox.h for the macro pre-process
strategy. */
/* Rotation constants: */
/* Rotation constants for the 4x64 variant, named R_64x4_<d>_<j>.
   Per the original note, these are the R_256 constants from the Threefish
   reference sources with the names changed to R_64x4... */
enum r123_enum_threefry64x4 {
    R_64x4_0_0 = 14,
    R_64x4_0_1 = 16,
    R_64x4_1_0 = 52,
    R_64x4_1_1 = 57,
    R_64x4_2_0 = 23,
    R_64x4_2_1 = 40,
    R_64x4_3_0 = 5,
    R_64x4_3_1 = 37,
    R_64x4_4_0 = 25,
    R_64x4_4_1 = 33,
    R_64x4_5_0 = 46,
    R_64x4_5_1 = 12,
    R_64x4_6_0 = 58,
    R_64x4_6_1 = 22,
    R_64x4_7_0 = 32,
    R_64x4_7_1 = 32
};
/* Rotation constants for the 2x64 variant, R_64x2_<d>_0 for d = 0..7.

   Provenance, as recorded by the original authors:
     Output from skein_rot_search: (srs64_B64-X1000)
     Random seed = 1. BlockSize = 128 bits. sampleCnt = 1024. rounds = 8, minHW_or=57
     Start: Tue Mar 1 10:07:48 2011
     rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format

   Figures reported alongside the constants:
      4 rounds: minHW = 4 [ 4 4 4 4 ]
      5 rounds: minHW = 8 [ 8 8 8 8 ]
      6 rounds: minHW = 16 [ 16 16 16 16 ]
      7 rounds: minHW = 32 [ 32 32 32 32 ]
      8 rounds: minHW = 64 [ 64 64 64 64 ]
      9 rounds: minHW = 64 [ 64 64 64 64 ]
     10 rounds: minHW = 64 [ 64 64 64 64 ]
     11 rounds: minHW = 64 [ 64 64 64 64 ] */
enum r123_enum_threefry64x2 {
    R_64x2_0_0 = 16,
    R_64x2_1_0 = 42,
    R_64x2_2_0 = 12,
    R_64x2_3_0 = 31,
    R_64x2_4_0 = 16,
    R_64x2_5_0 = 32,
    R_64x2_6_0 = 24,
    R_64x2_7_0 = 21
};
/* Rotation constants for the 4x32 variant, named R_32x4_<d>_<j>.

   Provenance, as recorded by the original authors:
     Output from skein_rot_search: (srs-B128-X5000.out)
     Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
     Start: Mon Aug 24 22:41:36 2009
     ...
     rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format

   Figures reported alongside the constants:
      4 rounds: minHW = 3 [ 3 3 3 3 ]
      5 rounds: minHW = 7 [ 7 7 7 7 ]
      6 rounds: minHW = 12 [ 13 12 13 12 ]
      7 rounds: minHW = 22 [ 22 23 22 23 ]
      8 rounds: minHW = 31 [ 31 31 31 31 ]
      9 rounds: minHW = 32 [ 32 32 32 32 ]
     10 rounds: minHW = 32 [ 32 32 32 32 ]
     11 rounds: minHW = 32 [ 32 32 32 32 ] */
enum r123_enum_threefry32x4 {
    R_32x4_0_0 = 10,
    R_32x4_0_1 = 26,
    R_32x4_1_0 = 11,
    R_32x4_1_1 = 21,
    R_32x4_2_0 = 13,
    R_32x4_2_1 = 27,
    R_32x4_3_0 = 23,
    R_32x4_3_1 = 5,
    R_32x4_4_0 = 6,
    R_32x4_4_1 = 20,
    R_32x4_5_0 = 17,
    R_32x4_5_1 = 11,
    R_32x4_6_0 = 25,
    R_32x4_6_1 = 10,
    R_32x4_7_0 = 18,
    R_32x4_7_1 = 20
};
/* Rotation constants for the 2x32 variant, R_32x2_<d>_0 for d = 0..7.

   Provenance, as recorded by the original authors:
     Output from skein_rot_search (srs32x2-X5000.out)
     Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
     Start: Tue Jul 12 11:11:33 2011
     rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize= 64].format

   Figures reported alongside the constants:
      4 rounds: minHW = 4 [ 4 4 4 4 ]
      5 rounds: minHW = 6 [ 6 8 6 8 ]
      6 rounds: minHW = 9 [ 9 12 9 12 ]
      7 rounds: minHW = 16 [ 16 24 16 24 ]
      8 rounds: minHW = 32 [ 32 32 32 32 ]
      9 rounds: minHW = 32 [ 32 32 32 32 ]
     10 rounds: minHW = 32 [ 32 32 32 32 ]
     11 rounds: minHW = 32 [ 32 32 32 32 ] */
enum r123_enum_threefry32x2 {
    R_32x2_0_0 = 13,
    R_32x2_1_0 = 15,
    R_32x2_2_0 = 26,
    R_32x2_3_0 = 6,
    R_32x2_4_0 = 17,
    R_32x2_5_0 = 29,
    R_32x2_6_0 = 16,
    R_32x2_7_0 = 24
};
/* Named word counts: WCNT2 is 2 and WCNT4 is 4. */
enum r123_enum_threefry_wcnt {
    WCNT2 = 2,
    WCNT4 = 4
};
/* Rotate the 64-bit value x left by N bit positions.  Both shift counts are
   masked with 63, so N is effectively taken modulo 64 and neither shift count
   can reach 64. */
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint64_t RotL_64(uint64_t x, unsigned int N));
R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N)
{
    const unsigned int up = N & 63;
    const unsigned int down = (64-N) & 63;
    return (x << up) | (x >> down);
}
/* Rotate the 32-bit value x left by N bit positions.  Both shift counts are
   masked with 31, so N is effectively taken modulo 32 and neither shift count
   can reach 32. */
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint32_t RotL_32(uint32_t x, unsigned int N));
R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N)
{
    const unsigned int up = N & 31;
    const unsigned int down = (32-N) & 31;
    return (x << up) | (x >> down);
}
/* Build a 64-bit value from a high and a low 32-bit half. */
#define SKEIN_MK_64(hi32,lo32) ((((uint64_t) (hi32)) << 32) + (lo32))
/* Parity constants, selected by word width through SKEIN_KS_PARITY##W. */
#define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
#define SKEIN_KS_PARITY32 0x1BD11BDA
#ifndef THREEFRY2x32_DEFAULT_ROUNDS
#define THREEFRY2x32_DEFAULT_ROUNDS 20
#endif
#ifndef THREEFRY2x64_DEFAULT_ROUNDS
#define THREEFRY2x64_DEFAULT_ROUNDS 20
#endif
#ifndef THREEFRY4x32_DEFAULT_ROUNDS
#define THREEFRY4x32_DEFAULT_ROUNDS 20
#endif
#ifndef THREEFRY4x64_DEFAULT_ROUNDS
#define THREEFRY4x64_DEFAULT_ROUNDS 20
#endif
/* _threefry2x_tpl(W): expands to the two-word Threefry API for word width W.
   The token W is pasted into every type, constant and function name
   (uint##W##_t, RotL_##W, R_##W##x2_..., SKEIN_KS_PARITY##W).  It defines:
     - threefry2x##W##_ctr_t, _key_t, _ukey_t: typedefs of struct r123array2x##W
     - threefry2x##W##keyinit(uk): returns uk unchanged
     - threefry2x##W##_R(Nrounds, in, k): runs Nrounds rounds on in with key k;
       Nrounds is asserted to be at most 32
     - enum r123_enum_threefry2x##W, holding threefry2x##W##_rounds
     - threefry2x##W(in, k): calls the _R form with threefry2x##W##_rounds rounds
   Key schedule: ks[0] and ks[1] are the two key words and ks[2] is
   SKEIN_KS_PARITY##W xor-ed with both of them.  ks[0], ks[1] are added to the
   state before the first round.
   Each round is add / rotate-left / xor on the pair X.v[0], X.v[1]; the eight
   rotation constants R_##W##x2_0_0 .. R_##W##x2_7_0 are reused every eight
   rounds.  After every fourth round that actually runs, two consecutive
   key-schedule words (cycling through ks[0..2]) are added to the state and
   the injection number r is added to X.v[1]. */
#define _threefry2x_tpl(W) \
typedef struct r123array2x##W threefry2x##W##_ctr_t; \
typedef struct r123array2x##W threefry2x##W##_key_t; \
typedef struct r123array2x##W threefry2x##W##_ukey_t; \
R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
R123_CUDA_DEVICE R123_STATIC_INLINE \
threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
threefry2x##W##_ctr_t X; \
uint##W##_t ks[2+1]; \
int i; /* avoid size_t to avoid need for stddef.h */ \
R123_ASSERT(Nrounds<=32); \
ks[2] = SKEIN_KS_PARITY##W; \
for (i=0;i < 2; i++) \
{ \
ks[i] = k.v[i]; \
X.v[i] = in.v[i]; \
ks[2] ^= k.v[i]; \
} \
\
/* Insert initial key before round 0 */ \
X.v[0] += ks[0]; X.v[1] += ks[1]; \
\
if(Nrounds>0){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>1){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>2){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>3){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>3){ \
/* InjectKey(r=1) */ \
X.v[0] += ks[1]; X.v[1] += ks[2]; \
X.v[1] += 1; /* X.v[2-1] += r */ \
} \
if(Nrounds>4){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>5){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>6){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>7){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>7){ \
/* InjectKey(r=2) */ \
X.v[0] += ks[2]; X.v[1] += ks[0]; \
X.v[1] += 2; \
} \
if(Nrounds>8){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>9){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>10){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>11){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>11){ \
/* InjectKey(r=3) */ \
X.v[0] += ks[0]; X.v[1] += ks[1]; \
X.v[1] += 3; \
} \
if(Nrounds>12){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>13){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>14){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>15){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>15){ \
/* InjectKey(r=4) */ \
X.v[0] += ks[1]; X.v[1] += ks[2]; \
X.v[1] += 4; \
} \
if(Nrounds>16){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>17){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>18){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>19){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>19){ \
/* InjectKey(r=5) */ \
X.v[0] += ks[2]; X.v[1] += ks[0]; \
X.v[1] += 5; \
} \
if(Nrounds>20){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>21){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>22){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>23){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>23){ \
/* InjectKey(r=6) */ \
X.v[0] += ks[0]; X.v[1] += ks[1]; \
X.v[1] += 6; \
} \
if(Nrounds>24){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>25){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>26){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>27){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>27){ \
/* InjectKey(r=7) */ \
X.v[0] += ks[1]; X.v[1] += ks[2]; \
X.v[1] += 7; \
} \
if(Nrounds>28){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>29){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>30){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>31){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>31){ \
/* InjectKey(r=8) */ \
X.v[0] += ks[2]; X.v[1] += ks[0]; \
X.v[1] += 8; \
} \
return X; \
} \
/** @ingroup ThreefryNxW */ \
enum r123_enum_threefry2x##W { threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS }; \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
R123_CUDA_DEVICE R123_STATIC_INLINE \
threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
return threefry2x##W##_R(threefry2x##W##_rounds, in, k); \
}
#define _threefry4x_tpl(W) \
typedef struct r123array4x##W threefry4x##W##_ctr_t; \
typedef struct r123array4x##W threefry4x##W##_key_t; \
typedef struct r123array4x##W threefry4x##W##_ukey_t; \
R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
R123_CUDA_DEVICE R123_STATIC_INLINE \
threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
threefry4x##W##_ctr_t X; \
uint##W##_t ks[4+1]; \
int i; /* avoid size_t to avoid need for stddef.h */ \
R123_ASSERT(Nrounds<=72); \
ks[4] = SKEIN_KS_PARITY##W; \
for (i=0;i < 4; i++) \
{ \
ks[i] = k.v[i]; \
X.v[i] = in.v[i]; \
ks[4] ^= k.v[i]; \
} \
\
/* Insert initial key before round 0 */ \
X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
\
if(Nrounds>0){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>1){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>2){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>3){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>3){ \
/* InjectKey(r=1) */ \
X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
X.v[4-1] += 1; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>4){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>5){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>6){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>7){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>7){ \
/* InjectKey(r=2) */ \
X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
X.v[4-1] += 2; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>8){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>9){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>10){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>11){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>11){ \
/* InjectKey(r=3) */ \
X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
X.v[4-1] += 3; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>12){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>13){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>14){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>15){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>15){ \
/* InjectKey(r=1) */ \
X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \
X.v[4-1] += 4; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>16){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>17){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>18){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>19){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>19){ \
/* InjectKey(r=1) */ \
X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
X.v[4-1] += 5; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>20){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>21){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>22){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>23){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>23){ \
/* InjectKey(r=1) */ \
X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
X.v[4-1] += 6; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>24){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>25){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>26){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>27){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>27){ \
/* InjectKey(r=1) */ \
X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
X.v[4-1] += 7; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>28){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>29){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>30){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>31){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>31){ \
/* InjectKey(r=1) */ \
X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
X.v[4-1] += 8; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>32){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>33){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>34){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>35){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>35){ \
/* InjectKey(r=1) */ \
X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \
X.v[4-1] += 9; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>36){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>37){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>38){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>39){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>39){ \
/* InjectKey(r=1) */ \
X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
X.v[4-1] += 10; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>40){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>41){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>42){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>43){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>43){ \
/* InjectKey(r=1) */ \
X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
X.v[4-1] += 11; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>44){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>45){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>46){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>47){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>47){ \
/* InjectKey(r=1) */ \
X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
X.v[4-1] += 12; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>48){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>49){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>50){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>51){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>51){ \
/* InjectKey(r=1) */ \
X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
X.v[4-1] += 13; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>52){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>53){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>54){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>55){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>55){ \
/* InjectKey(r=1) */ \
X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \
X.v[4-1] += 14; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>56){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>57){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>58){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>59){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>59){ \
/* InjectKey(r=1) */ \
X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
X.v[4-1] += 15; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>60){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>61){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>62){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>63){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>63){ \
/* InjectKey(r=1) */ \
X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
X.v[4-1] += 16; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>64){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>65){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>66){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>67){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>67){ \
/* InjectKey(r=1) */ \
X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
X.v[4-1] += 17; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>68){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>69){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>70){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>71){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>71){ \
/* InjectKey(r=1) */ \
X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
X.v[4-1] += 18; /* X.v[WCNT4-1] += r */ \
} \
\
return X; \
} \
/** @ingroup ThreefryNxW */ \
enum r123_enum_threefry4x##W { threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS }; \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
R123_CUDA_DEVICE R123_STATIC_INLINE \
threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
return threefry4x##W##_R(threefry4x##W##_rounds, in, k); \
}
/** \endcond */
/* Expand the generator macros once per supported shape: 2 or 4 words of
   64 or 32 bits each. Each expansion defines the threefryNxW types, the
   threefryNxW_R round function and the threefryNxW default-rounds wrapper. */
_threefry2x_tpl(64)
_threefry2x_tpl(32)
_threefry4x_tpl(64)
_threefry4x_tpl(32)
/* gcc4.5 and 4.6 seem to optimize a macro-ized threefryNxW better
than a static inline function. Why? */
/* From this point on, a call written threefryNxW(c,k) is replaced by the
   preprocessor with a direct call to threefryNxW_R using the default round
   count, instead of going through the inline wrapper of the same name
   generated above. */
#define threefry2x32(c,k) threefry2x32_R(threefry2x32_rounds, c, k)
#define threefry4x32(c,k) threefry4x32_R(threefry4x32_rounds, c, k)
#define threefry2x64(c,k) threefry2x64_R(threefry2x64_rounds, c, k)
#define threefry4x64(c,k) threefry4x64_R(threefry4x64_rounds, c, k)
#ifdef SWIFT_DEBUG_CHECKS
#ifdef __cplusplus
/** \cond HIDDEN_FROM_DOXYGEN */
/*
 * _threefryNxWclass_tpl(NxW): emit the C++ functor wrapper for one Threefry
 * shape inside namespace r123.
 *
 *   r123::Threefry<NxW>_R<R>  - class template whose operator()(ctr, key)
 *                               forwards to threefry<NxW>_R with R rounds.
 *                               R above 72 is rejected at compile time.
 *   r123::Threefry<NxW>       - the same template instantiated with the
 *                               default round count threefry<NxW>_rounds.
 *
 * Member typedefs: ctr_type is the counter array type; key_type and
 * ukey_type both name the key array type.
 */
#define _threefryNxWclass_tpl(NxW) \
namespace r123{ \
    template<unsigned int R> \
    struct Threefry##NxW##_R{ \
        typedef threefry##NxW##_ctr_t ctr_type; \
        typedef threefry##NxW##_key_t key_type; \
        typedef key_type ukey_type; \
        static const unsigned int rounds=R; \
        inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type counter, key_type k)){ \
            R123_STATIC_ASSERT(R<=72, "threefry is only unrolled up to 72 rounds\n"); \
            return threefry##NxW##_R(R, counter, k); \
        } \
    }; \
    typedef Threefry##NxW##_R<threefry##NxW##_rounds> Threefry##NxW; \
} /* namespace r123 */
/** \endcond */
/* Emit the r123::ThreefryNxW_R class template and the r123::ThreefryNxW
   default-rounds typedef for each of the four supported shapes. */
_threefryNxWclass_tpl(2x32)
_threefryNxWclass_tpl(4x32)
_threefryNxWclass_tpl(2x64)
_threefryNxWclass_tpl(4x64)
/* The _tpl macros cannot do token-pasting inside comments,
so we just write out the boilerplate documentation four times... */
/**
@defgroup ThreefryNxW Threefry Classes and Typedefs
The ThreefryNxW classes export the member functions, typedefs and
operator overloads required by a @ref CBRNG "CBRNG" class.
As described in
<a href="http://dl.acm.org/citation.cfm?doid=2063405"><i>Parallel Random Numbers: As Easy as 1, 2, 3</i> </a>,
the Threefry family is closely related to the Threefish block cipher from
<a href="http://www.skein-hash.info/"> Skein Hash Function</a>.
Threefry is \b not suitable for cryptographic use.
Threefry uses integer addition, bitwise rotation, xor and permutation of words to randomize its output.
@class r123::Threefry2x32_R
@ingroup ThreefryNxW
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
The template argument, ROUNDS, is the number of times the Threefry round
function will be applied.
As of September 2011, the authors know of no statistical flaws with
ROUNDS=13 or more for Threefry2x32.
@typedef r123::Threefry2x32
@ingroup ThreefryNxW
Threefry2x32 is equivalent to Threefry2x32_R<20>. With 20 rounds,
Threefry2x32 has a considerable safety margin over the minimum number
of rounds with no known statistical flaws, but still has excellent
performance.
@class r123::Threefry2x64_R
@ingroup ThreefryNxW
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
The template argument, ROUNDS, is the number of times the Threefry round
function will be applied.
In November 2011, the authors discovered that 13 rounds of
Threefry2x64 sequenced by strided, interleaved key and counter
increments failed a very long (longer than the default BigCrush
length) WeightDistrib test. At the same time, it was confirmed that
14 rounds passes much longer tests (up to 5x10^12 samples) of a
similar nature. The authors know of no statistical flaws with
ROUNDS=14 or more for Threefry2x64.
@typedef r123::Threefry2x64
@ingroup ThreefryNxW
Threefry2x64 is equivalent to Threefry2x64_R<20>. With 20 rounds,
Threefry2x64 has a considerable safety margin over the minimum number
of rounds with no known statistical flaws, but still has excellent
performance.
@class r123::Threefry4x32_R
@ingroup ThreefryNxW
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
The template argument, ROUNDS, is the number of times the Threefry round
function will be applied.
As of September 2011, the authors know of no statistical flaws with
ROUNDS=12 or more for Threefry4x32.
@typedef r123::Threefry4x32
@ingroup ThreefryNxW
Threefry4x32 is equivalent to Threefry4x32_R<20>. With 20 rounds,
Threefry4x32 has a considerable safety margin over the minimum number
of rounds with no known statistical flaws, but still has excellent
performance.
@class r123::Threefry4x64_R
@ingroup ThreefryNxW
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
The template argument, ROUNDS, is the number of times the Threefry round
function will be applied.
As of September 2011, the authors know of no statistical flaws with
ROUNDS=12 or more for Threefry4x64.
@typedef r123::Threefry4x64
@ingroup ThreefryNxW
Threefry4x64 is equivalent to Threefry4x64_R<20>. With 20 rounds,
Threefry4x64 has a considerable safety margin over the minimum number
of rounds with no known statistical flaws, but still has excellent
performance.
*/
#endif
#endif
#endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment