/*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

//
// @author raver119@gmail.com
//
#ifndef LIBND4J_AGGREGATE_OPS_H
#define LIBND4J_AGGREGATE_OPS_H

#include <ops/ops.h>
#include <templatemath.h>

#define HS_MAX_EXP 6.0f
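// HS_MAX_EXP bounds the precomputed sigmoid lookup table used by the word2vec-style
// ops below: dot products outside [-HS_MAX_EXP, HS_MAX_EXP) either skip the update
// (hierarchical softmax) or use the saturated gradient (negative sampling)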

#ifdef __CUDACC__
#define aggregate_def __device__ inline static
#else
#include <ops/gemm.h>
#define aggregate_def inline static
#endif
/*
 * Aggregate Ops are intended primarily for CUDA: each op is designed to execute
 * within a SINGLE block. When batched, they provide a reasonable level of
 * parallelism for tasks that otherwise parallelize poorly.
 *
 * On CPU, aggregate ops avoid OpenMP multi-threading; only SIMD is applied.
 */
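
/*
 * A minimal host-side sketch of the shared argument-packing convention
 * (illustrative only; real dispatch presumably goes through the aggregate
 * execution entry points rather than calling these templates directly):
 *
 *     float x[] = {1.0f, 2.0f, 3.0f};
 *     float y[] = {4.0f, 5.0f, 6.0f};
 *     float z[] = {0.0f};
 *
 *     float *arguments[] = {x, y, z};   // pointer arguments
 *     int indexArguments[] = {3};       // [0] = vectorLength
 *
 *     aggregateOps::Dot<float>::executeAggregate(arguments, 3, nullptr, 0,
 *             indexArguments, 1, nullptr, 0, nullptr, 0);
 *     // z[0] == 32.0f
 */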
namespace aggregateOps {

    template<typename T>
    class GEMM {
    public:
#ifdef __CUDACC__
        aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) {
            // GEMM is not implemented on the device side; this is a no-op
        }
#endif

#ifndef __CUDACC__
        static CBLAS_ORDER convertOrder(int from) {
            switch (from) {
                // 'c' and 'C' denote row-major ordering
                case 'c':
                case 'C':
                    return CblasRowMajor;
                // 'f' and 'F' denote column-major (Fortran) ordering
                case 'f':
                case 'F':
                    return CblasColMajor;
                default:
                    return CblasColMajor;
            }
        }


        static CBLAS_TRANSPOSE convertTranspose(int from) {
            switch (from) {
                case 't':
                case 'T':
                    return CblasTrans;
                case 'n':
                case 'N':
                    return CblasNoTrans;
                // 'c'/'C' request the conjugate transpose
                case 'c':
                case 'C':
                    return CblasConjTrans;
                default:
                    return CblasNoTrans;
            }
        }
#endif

#ifndef __CUDACC__
        aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) {
            int M = indexArguments[0];
            int N = indexArguments[1];
            int K = indexArguments[2];
            int lda = indexArguments[3];
            int ldb = indexArguments[4];
            int ldc = indexArguments[5];
            int TransA = indexArguments[6];
            int TransB = indexArguments[7];
            int Order = indexArguments[8];

            T alpha = realArguments[0];
            T beta = realArguments[1];

            T *A = arguments[0];
            T *B = arguments[1];
            T *C = arguments[2];

            nd4j::blas::GEMM<T, T, T>::op(convertOrder(Order), convertTranspose(TransA), convertTranspose(TransB), M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
        }
#else
        aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) {
            // stub for nvcc
        }
#endif
    };
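
    /*
     * A minimal host-side sketch of the GEMM argument layout (illustrative only):
     * indexArguments = {M, N, K, lda, ldb, ldc, TransA, TransB, Order},
     * realArguments = {alpha, beta}, arguments = {A, B, C}.
     *
     *     // 2x3 * 3x2 = 2x2, column-major, no transposes
     *     float A[] = {1, 4, 2, 5, 3, 6};           // 2x3, lda = 2
     *     float B[] = {1, 0, 1, 0, 1, 1};           // 3x2, ldb = 3
     *     float C[4] = {0};                         // 2x2, ldc = 2
     *
     *     float *arguments[] = {A, B, C};
     *     int indexArguments[] = {2, 2, 3, 2, 3, 2, 'N', 'N', 'f'};
     *     float realArguments[] = {1.0f, 0.0f};     // alpha, beta
     *
     *     aggregateOps::GEMM<float>::executeAggregate(arguments, 3, nullptr, 0,
     *             indexArguments, 9, nullptr, 0, realArguments, 2);
     *     // C becomes {4, 10, 5, 11} (column-major)
     */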

    /**
     * This class is not registered as a standalone op, since it is never used
     * directly: it only runs as part of SkipGram or CBOW.
     */
    template<typename T>
    class HierarchicSoftmax {
        private:

        public:

        aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) {
            int vectorLength = indexArguments[0];
            int expLength = indexArguments[1];
            int code = indexArguments[2];
            int isInference = indexArguments[3];

            T *syn0 = arguments[0]; // we pass row pointer here
            T *syn1 = arguments[1]; // we pass row pointer here
            T *expTable = arguments[2];
            T *neu1e = arguments[3];


            T dot(0.0f);
            T g(0.0f);
            T f(0.0f);
            T alpha = realArguments[0];

            // dot
            for (int x = 0; x < vectorLength; x++) {
                dot += syn0[x] * syn1[x];
            }

            // skip this pair entirely when dot falls outside the lookup-table range
            if (dot < (T) - HS_MAX_EXP || dot >= (T) HS_MAX_EXP) {
                return;
            }

            int idx = static_cast<int>((dot + HS_MAX_EXP) * ((T) expLength / HS_MAX_EXP / 2.0f));

            if (idx >= expLength || idx < 0) {
                return;
            }
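            // f = sigmoid(dot), read from the precomputed lookup table; g below is
            // the standard word2vec hierarchical-softmax gradient scaled by alpha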

            f = expTable[idx];
            g = (static_cast<T>(1.0f) - static_cast<T>(code) - f) * alpha;

            // axpy1
            PRAGMA_OMP_SIMD
            for (int x = 0; x < vectorLength; x++) {
                neu1e[x] = g * syn1[x] + neu1e[x];
            }

            // axpy2
            if (!isInference) {
                PRAGMA_OMP_SIMD
                for (int x = 0; x < vectorLength; x++) {
                    syn1[x] = g * syn0[x] + syn1[x];
                }
            }
        }

#ifdef __CUDACC__
        aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) {
            /*
                We know that syn0 & syn1 are 2D matrices, so we can just use offsets here
            */
            __shared__ int vectorLength;
            __shared__ int expLength;
            __shared__ int code;
            __shared__ int isInference;

            T *syn0 = arguments[0];
            T *syn1 = arguments[1];
            T *expTable = arguments[2];
            T *neu1e = arguments[3];

            __shared__ T dot;
            __shared__ T g;
            __shared__ T f;
            __shared__ T alpha;

            if (threadIdx.x == 0) {
                vectorLength = indexArguments[0];
                expLength = indexArguments[1];
                code = indexArguments[2];
                isInference = indexArguments[3];

                dot = (T) 0.0f;

                alpha = realArguments[0];
            }
            __syncthreads();


            // TODO: it would be better to compute the dot product without atomicAdd,
            // e.g. via a block-level tree reduction (see the sketch below)
            // dot
            for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) {
                T prod = syn0[x] * syn1[x];
                nd4j::math::atomics::nd4j_atomicAdd<T>(&dot, prod);
            }
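
            /*
             * A block-level tree reduction would avoid the atomics; a sketch, assuming
             * blockDim.x is a power of two, a shared T scratch[] of blockDim.x elements,
             * and localSum holding each thread's private partial sum (all hypothetical):
             *
             *     scratch[threadIdx.x] = localSum;
             *     for (int s = blockDim.x / 2; s > 0; s >>= 1) {
             *         __syncthreads();
             *         if (threadIdx.x < s)
             *             scratch[threadIdx.x] += scratch[threadIdx.x + s];
             *     }
             *     __syncthreads();
             *     // scratch[0] now holds the block-wide dot product
             */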


            // gradient
            __syncthreads();

            if (dot < - (T) HS_MAX_EXP || dot >= (T) HS_MAX_EXP)
                return;

            int idx = (int) ((dot + HS_MAX_EXP) * ((T) expLength / (T) HS_MAX_EXP / 2.0));

            if (idx >= expLength)
                return;


            if (threadIdx.x == 0) {
                // gradient calculation
                f = expTable[idx];
                g = ((T) 1.0f - (T) code - f) * alpha;
            }
            __syncthreads();

            // axpy1
            for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) {
                neu1e[x] = g * syn1[x] + neu1e[x];
            }

            // axpy2
            if (!isInference)
                for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) {
                    syn1[x] = g * syn0[x] + syn1[x];
                }
        }
#endif
    };
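
    /*
     * Both HierarchicSoftmax and NegativeSampling read expTable as a precomputed
     * sigmoid: idx = (dot + HS_MAX_EXP) * (expLength / HS_MAX_EXP / 2) maps
     * dot in [-HS_MAX_EXP, HS_MAX_EXP) linearly onto [0, expLength). A table with
     * that layout can be built word2vec-style; a host-side sketch (the real table
     * is built by the caller and merely assumed here):
     *
     *     #include <cmath>
     *
     *     static void buildExpTable(float *expTable, int expLength) {
     *         for (int i = 0; i < expLength; i++) {
     *             // x sweeps [-HS_MAX_EXP, HS_MAX_EXP)
     *             float x = ((float) i / expLength * 2.0f - 1.0f) * HS_MAX_EXP;
     *             expTable[i] = 1.0f / (1.0f + expf(-x));   // sigmoid(x)
     *         }
     *     }
     */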

    /**
     * This class is not registered as a standalone op, since it is never used
     * directly: it only runs as part of SkipGram or CBOW.
     */
    template<typename T>
    class NegativeSampling {
    public:

        aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) {
            int vectorLength = indexArguments[0];
            int expLength = indexArguments[1];
            int code = indexArguments[2];
            int isInference = indexArguments[3];

            T *syn0 = arguments[0]; // we pass row pointer here
            T *syn1Neg = arguments[1]; // we pass row pointer here
            T *expTable = arguments[2];
            T *neu1e = arguments[3];

            T dot = (T) 0.0f;
            T g = (T) 0.0f;
            T alpha = realArguments[0];

            // dot
            for (int x = 0; x < vectorLength; x++) {
                dot += syn0[x] * syn1Neg[x];
            }
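
            // gradient: outside the table range the sigmoid saturates (toward 1 for
            // large positive dot, toward 0 for large negative dot), so g collapses
            // to (code - 1) * alpha or code * alpha in the branches below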

            if (dot > HS_MAX_EXP)
                g = (code - 1) * alpha;
            else if (dot < (T) - HS_MAX_EXP)
                g = (code - 0) * alpha;
            else {
                int idx = (int) ((dot + (T) HS_MAX_EXP) * ((T) expLength / HS_MAX_EXP / 2.0));
                if (idx >= expLength)
                    return;

                if (idx < 0)
                    return;

                g = ((T) code - expTable[idx]) * alpha;
            }

            // axpy1
            PRAGMA_OMP_SIMD
            for (int x = 0; x < vectorLength; x++) {
                neu1e[x] = g * syn1Neg[x] + neu1e[x];
            }

            // axpy2
            if (!isInference) {
                PRAGMA_OMP_SIMD
                for (int x = 0; x < vectorLength; x++) {
                    syn1Neg[x] = g * syn0[x] + syn1Neg[x];
                }
            }
        }

#ifdef __CUDACC__
        aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) {
            /*
                We know that syn0 & syn1 are 2D matrices, so we can just use offsets here
            */
            __shared__ int vectorLength;
            __shared__ int expLength;
            __shared__ int code;
            __shared__ int isInference;

            T *syn0 = arguments[0];
            T *syn1Neg = arguments[1];
            T *expTable = arguments[2];
            T *neu1e = arguments[3];

            __shared__ T dot;
            __shared__ T g;
            __shared__ T alpha;

            if (threadIdx.x == 0) {
                vectorLength = indexArguments[0];
                expLength = indexArguments[1];
                code = indexArguments[2];
                isInference = indexArguments[3];

                dot = (T) 0.0f;

                alpha = realArguments[0];
            }
            __syncthreads();


            // TODO: it would be better to compute the dot product without atomicAdd,
            // e.g. via a block-level tree reduction (see the sketch in HierarchicSoftmax)
            // dot
            for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) {
                T prod = syn0[x] * syn1Neg[x];
                nd4j::math::atomics::nd4j_atomicAdd<T>(&dot, prod);
            }


            // gradient
            __syncthreads();


            int idx = (int) ((dot + (T) HS_MAX_EXP) * ((T) expLength / (T) HS_MAX_EXP / 2.0));
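            // bail out only when the index is invalid while dot lies inside the clamp
            // range; outside that range idx is never used and g comes from the
            // saturated branches below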
            if (idx >= expLength && dot <= (T) HS_MAX_EXP && dot >= (T) -HS_MAX_EXP)
                return;


            if (threadIdx.x == 0) {
                // gradient calculation
                if (dot > (T) HS_MAX_EXP)
                    g = (code - 1) * alpha;
                else if (dot < (T) - HS_MAX_EXP)
                    g = (code - 0) * alpha;
                else {
                    g = ((T) code - expTable[idx]) * alpha;
                }

            }
            __syncthreads();


            // axpy1
            for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) {
                neu1e[x] = g * syn1Neg[x] + neu1e[x];
            }

            // axpy2
            if (!isInference)
                for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) {
                    syn1Neg[x] = g * syn0[x] + syn1Neg[x];
                }


        }
#endif
    };

    template<typename T>
    class Dot {
    public:

        aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) {
            T *vecX = arguments[0];
            T *vecY = arguments[1];
            T *vecZ = arguments[2];

            T dot = (T) 0.0f;

            int vectorLength = indexArguments[0];

            PRAGMA_OMP_SIMD_SUM(dot)
            for (int x = 0; x < vectorLength; x++) {
                dot += vecX[x] * vecY[x];
            }

            vecZ[0] = dot;
        }

#ifdef __CUDACC__
        aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) {
            T *vecX = arguments[0];
            T *vecY = arguments[1];
            T *vecZ = arguments[2];

            int vectorLength = indexArguments[0];

            __shared__ T dot;
            if (threadIdx.x == 0)
                dot = (T) 0.0f;
            __syncthreads();

            for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) {
                T prod = vecX[x] * vecY[x];
                nd4j::math::atomics::nd4j_atomicAdd<T>(&dot, prod);
            }
            __syncthreads();

            if (threadIdx.x == 0)
                vecZ[0] = dot;
        }
#endif
    };

    template<typename T>
    class Axpy {
    public:

        aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) {
            T *vecX = arguments[0];
            T *vecY = arguments[1];

            T alpha = realArguments[0];

            int vectorLength = indexArguments[0];

            PRAGMA_OMP_SIMD
            for (int x = 0; x < vectorLength; x++) {
                vecY[x] = alpha * vecX[x] + vecY[x];
            }
        }

#ifdef __CUDACC__
        aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) {
            T *vecX = arguments[0];
            T *vecY = arguments[1];

            T alpha = realArguments[0];

            int vectorLength = indexArguments[0];

            for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) {
                vecY[x] = alpha * vecX[x] + vecY[x];
            }
            __syncthreads();
        }
#endif
    };


    template<typename T>
    class SkipGram {
    public:
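
        /*
         * One SkipGram training step for a single target row: runs the hierarchical
         * softmax rounds and/or negative sampling rounds against the selected syn0
         * row (or the inference vector), accumulating the error into a local neu1e
         * buffer that is applied back at the end.
         */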

        aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) {
            int syn0Row = indexArguments[0];
            int vectorLength = indexArguments[1];
            int hsRounds = indexArguments[2];
            int ngRounds = indexArguments[3];
            int expLength = indexArguments[4];
            int vocabSize = indexArguments[5];
            int ngStarter = indexArguments[6];
            int negTableLength = indexArguments[7];
            int isInference = indexArguments[8];


            auto neu1e = new T[vectorLength];
            std::memset(neu1e, 0, sizeof(T) * vectorLength);

            T *args[4];
            int idxArgs[4];

            args[1] = arguments[1]; // syn1
            args[2] = arguments[2]; // expTable
            args[3] = neu1e;


            idxArgs[0] = vectorLength; // vectorLength
            idxArgs[1] = expLength; // expLength
            idxArgs[3] = isInference;

            T *syn1Neg = arguments[3];
            T *negTable = arguments[4];
            T *inferenceVector = arguments[5];

            T *syn0 = isInference == 1 ? inferenceVector : arguments[0] + (syn0Row * vectorLength);

            args[0] = syn0;// syn0

            int *idxSyn1 = intArrays[0];
            int *codes = intArrays[1];


            auto next_random = static_cast<unsigned long long>(realArguments[1]);

            if (hsRounds > 0) {
                for (int r = 0; r < hsRounds; r++) {
                    args[1] = arguments[1] + (idxSyn1[r] * vectorLength); // syn1 row
                    idxArgs[2] = codes[r];  // code for row

                    HierarchicSoftmax<T>::executeAggregate(args, 4, nullptr, 0, idxArgs, 4, nullptr, 0, realArguments, 1);
                }
            }

            int target = ngStarter;
            if (ngRounds > 0) {
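                // round 0 trains the positive pair (code 1); the remaining rounds
                // draw negative samples (code 0) from the unigram table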
                for (int r = 0; r < ngRounds + 1; r++) {
                    if (r == 0) {
                        idxArgs[2] = 1;
                    } else {
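                        // word2vec-style linear congruential RNG step; the high bits
                        // (>> 16) index into the unigram negative-sampling table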
                        next_random = next_random * (unsigned long long) 25214903917 + 11;
                        target = negTable[(next_random >> 16) % negTableLength];

                        if (target <= 0 || target >= vocabSize) target = next_random % (vocabSize - 1) + 1;
                        if (target == ngStarter)
                            continue;

                        idxArgs[2] = 0;
                    }

                    args[1] = syn1Neg + (target * vectorLength); // syn1Neg instead of syn1

                    NegativeSampling<T>::executeAggregate(args, 4, nullptr, 0, idxArgs, 4, nullptr, 0, realArguments, 1);
                }
            }

            if (!isInference) {
                PRAGMA_OMP_SIMD
                for (int x = 0; x < vectorLength; x++) {
                    syn0[x] += neu1e[x];
                }
            } else {
                PRAGMA_OMP_SIMD
                for (int x = 0; x < vectorLength; x++) {
                    inferenceVector[x] += neu1e[x];
                }
            }

            delete[] neu1e;
        }

#ifdef __CUDACC__
        aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) {
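            // requires dynamic shared memory: at least vectorLength * sizeof(T)
            // for the neu1e scratch buffer carved out below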
            __shared__ int syn0Row;
            __shared__ int vectorLength;
            __shared__ int hsRounds;
            __shared__ int ngRounds;
            __shared__ int expLength;
            __shared__ int vocabSize;
            __shared__ int ngStarter;
            __shared__ int negTableLength;
            __shared__ int isInference;

            __shared__ T *neu1e;

            __shared__ T *args[4];
            __shared__ int idxArgs[4];


            __shared__ unsigned long long next_random;

            __shared__ T *negTable;
            T *syn1Neg = arguments[3];
            __shared__ T *inferenceVector;

            if (threadIdx.x == 0) {
                extern __shared__ unsigned char shmem[];
                neu1e = (T *) shmem;

                syn0Row = indexArguments[0];
                vectorLength = indexArguments[1];
                hsRounds = indexArguments[2];
                ngRounds = indexArguments[3];
                expLength = indexArguments[4];
                vocabSize = indexArguments[5];
                ngStarter = indexArguments[6];
                negTableLength = indexArguments[7];
                isInference = indexArguments[8];

                inferenceVector = arguments[5];

                next_random = (unsigned long long) realArguments[1];

                args[0] = isInference == 1 ? inferenceVector : arguments[0] + (syn0Row * vectorLength); // syn0
                args[1] = arguments[1]; // syn1
                args[2] = arguments[2]; // expTable
                args[3] = neu1e;

                negTable = arguments[4];

                idxArgs[0] = vectorLength; // vectorLength
                idxArgs[1] = expLength; // expLength
                idxArgs[3] = isInference;
            }
            __syncthreads();

            T *syn0 = isInference ? inferenceVector : arguments[0] + (syn0Row * vectorLength);

            for (int i = threadIdx.x; i < vectorLength; i+=blockDim.x) {
                neu1e[i] = (T) 0.0f;
            }

            int *idxSyn1 = intArrays[0];
            int *codes = intArrays[1];


            for (int r = 0; r < hsRounds; r++) {
                if (threadIdx.x == 0) {
                    args[1] = arguments[1] + (idxSyn1[r] * vectorLength);// syn1 row
                    idxArgs[2] = codes[r];  // code for row
                }
                __syncthreads();

                HierarchicSoftmax<T>::executeAggregateCuda(args, 4, nullptr, 0, idxArgs, 4, nullptr, 0, realArguments, 1);
            }
            __syncthreads();


            __shared__ int target;
            if (ngRounds > 0)
                for (int r = 0; r < ngRounds + 1; r++) {
                    if (threadIdx.x == 0) {
                        if (r == 0) {
                            // intentional: round 0 trains the positive target
                            target = ngStarter;

                            idxArgs[2] = 1;
                        } else {
                            next_random = next_random * (unsigned long long)25214903917 + 11 + blockIdx.x;
                            target = negTable[(next_random >> 16) % negTableLength];

                            if (target <= 0 || target >= vocabSize) target = next_random % (vocabSize - 1) + 1;

                            idxArgs[2] = 0;
                        }

                        args[1] = syn1Neg + (target * vectorLength);
                    }
                    __syncthreads();

                    // placed after the barrier so that all threads take the same continue path
                    if (r != 0 && target == ngStarter)
                        continue;

                    NegativeSampling<T>::executeAggregateCuda(args, 4, nullptr, 0, idxArgs, 4, nullptr, 0, realArguments, 1);
                }

            // final axpy with 1.0f as alpha
            if (!isInference)
                for (int x = threadIdx.x; x < vectorLength; x+= blockDim.x) {
                    syn0[x] += neu1e[x];
                }
            else
                for (int x = threadIdx.x; x < vectorLength; x+= blockDim.x) {
                    inferenceVector[x] += neu1e[x];
                }
        }
#endif
    };

    template<typename T>
    class CBOW {
    public:
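
        /*
         * One CBOW training step: averages the context rows of syn0 (plus the
         * inference vector, when present) into neu1, runs hierarchical softmax
         * and/or negative sampling against that average, then propagates the
         * accumulated error neu1e back into the context rows (or the inference
         * vector).
         */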

        aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments,
                         int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays,
                         T *realArguments, int numRealArguments) {
            int vectorLength = indexArguments[0];
            int hsRounds = indexArguments[1];
            int ngRounds = indexArguments[2];
            int expLength = indexArguments[3];
            int vocabSize = indexArguments[4];
            int ngStarter = indexArguments[5];
            int negTableLength = indexArguments[6];
            int idxSyn0Length = indexArguments[7];
            //int initialIdx = indexArguments[8];
            int numLabels = indexArguments[9];
            int trainWords = indexArguments[10];
            int isInference = indexArguments[11];


            int *idxSyn0 = intArrays[0];
            int *idxSyn1 = intArrays[1];
            int *codes = intArrays[2];


            T *neu1 = new T[vectorLength];
            T *neu1e = new T[vectorLength];
            std::memset(neu1, 0, sizeof(T) * vectorLength);
            std::memset(neu1e, 0, sizeof(T) * vectorLength);

            T *syn0 = arguments[0];
            T *syn1 = arguments[1];
            T *expTable = arguments[2];
            T *syn1Neg = arguments[3];
            T *negTable = arguments[4];
            T *inferenceVector = arguments[5];

            T *args[4];
            int idxArgs[4];
            idxArgs[0] = vectorLength; // vectorLength
            idxArgs[1] = expLength; // expLength
            idxArgs[3] = isInference;

            unsigned long long next_random = (unsigned long long) realArguments[1];

            // building neu1 for current window
            for (int c = 0; c < idxSyn0Length; c++) {
                T *syn0word = syn0 + (idxSyn0[c] * vectorLength);

                PRAGMA_OMP_SIMD
                for (int i = 0; i < vectorLength; i++) {
                    neu1[i] += syn0word[i];
                }
            }

            // for inference we use additional inference vector
            if (isInference) {
                PRAGMA_OMP_SIMD
                for (int i = 0; i < vectorLength; i++) {
                    neu1[i] += inferenceVector[i];
                }
            }


            // average neu1 (during inference, the inference vector counts as one more contributor)
            if (idxSyn0Length > 0) {
                PRAGMA_OMP_SIMD
                for (int i = 0; i < vectorLength; i++) {
                    neu1[i] /= idxSyn0Length + isInference;
                }
            }

            args[0] = neu1;
            args[2] = expTable;
            args[3] = neu1e;

            if (hsRounds > 0)
                for (int i = 0; i < hsRounds; i++) {
                    args[1] = syn1 + (idxSyn1[i] * vectorLength);
                    idxArgs[2] = codes[i];

                    HierarchicSoftmax<T>::executeAggregate(args, 4, nullptr, 0, idxArgs, 4, nullptr, 0, realArguments, 2);
                }

            int target = ngStarter;
            if (ngRounds > 0)
                for (int i = 0; i < ngRounds + 1; i++) {
                    if (i == 0) {
                        idxArgs[2] = 1;
                    } else {
                        next_random = next_random * (unsigned long long) 25214903917 + 11;
                        target = negTable[(next_random >> 16) % negTableLength];

                        if (target <= 0 || target >= vocabSize) target = next_random % (vocabSize - 1) + 1;
                        if (target == ngStarter)
                            continue;

                        idxArgs[2] = 0;
                    }

                    args[1] = syn1Neg + (target * vectorLength); // syn1Neg instead of syn1

                    NegativeSampling<T>::executeAggregate(args, 4, nullptr, 0, idxArgs, 4, nullptr, 0, realArguments, 2);
                }


            // if we're not training context words, skip the start of idxSyn0 and update only the label rows
            int starter = trainWords == 1 ? 0 : idxSyn0Length - numLabels;

            // propagate neu1e -> syn0
            if (!isInference) {
                for (int c = starter; c < idxSyn0Length; c++) {
                    T *syn0word = arguments[0] + (idxSyn0[c] * vectorLength);

                    PRAGMA_OMP_SIMD
                    for (int i = 0; i < vectorLength; i++) {
                        syn0word[i] += neu1e[i];
                    }
                }
            } else {
                PRAGMA_OMP_SIMD
                for (int i = 0; i < vectorLength; i++) {
                    inferenceVector[i] += neu1e[i];
                }
            }

            delete[] neu1;
            delete[] neu1e;
        }


#ifdef __CUDACC__
        aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments,
                         int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays,
                         T *realArguments, int numRealArguments) {
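            // requires dynamic shared memory: at least 2 * vectorLength * sizeof(T),
            // since both neu1 and neu1e are carved out of the extern buffer below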
            __shared__ int vectorLength;
            __shared__ int hsRounds;
            __shared__ int ngRounds;
            __shared__ int expLength;
            __shared__ int vocabSize;
            __shared__ int ngStarter;
            __shared__ int negTableLength;
            __shared__ int idxSyn0Length;
            __shared__ int initialIdx;
            __shared__ int numLabels;
            __shared__ int trainWords;
            __shared__ int isInference;

            int *idxSyn0 = intArrays[0];
            int *idxSyn1 = intArrays[1];
            int *codes = intArrays[2];

            __shared__ T *neu1;
            __shared__ T *neu1e;

            __shared__ T *args[5];
            __shared__ int idxArgs[4];

            T *syn0 = arguments[0];
            T *syn1 = arguments[1];
            //T *expTable = arguments[2];
            T *syn1Neg = arguments[3];
            T *negTable = arguments[4];
            T *inferenceVector = arguments[5];

            if (threadIdx.x == 0) {
                vectorLength = indexArguments[0];
                hsRounds = indexArguments[1];
                ngRounds = indexArguments[2];
                expLength = indexArguments[3];
                vocabSize = indexArguments[4];
                ngStarter = indexArguments[5];
                negTableLength = indexArguments[6];
                idxSyn0Length = indexArguments[7];
                initialIdx = indexArguments[8];
                numLabels = indexArguments[9];
                trainWords = indexArguments[10];
                isInference = indexArguments[11];

                extern __shared__ unsigned char shmem[];
                neu1 = (T *) shmem;
                neu1e = neu1 + vectorLength;

                args[0] = neu1;
                args[2] = arguments[2]; //expTable
                args[3] = neu1e;

                idxArgs[0] = vectorLength; // vectorLength
                idxArgs[1] = expLength; // expLength
                idxArgs[3] = isInference;
            }
            __syncthreads();

            for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) {
                neu1[i] = (T) 0.0f;
                neu1e[i] = (T) 0.0f;
            }

            unsigned long long next_random = (unsigned long long) realArguments[1];
            for (int c = 0; c < idxSyn0Length; c++) {
                T *syn0word = syn0 + (idxSyn0[c] * vectorLength);

                for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) {
                    neu1[i] += syn0word[i];
                }
            }

            if (isInference)
                for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) {
                    neu1[i] += inferenceVector[i];
                }

            // average neu1
            if (idxSyn0Length > 0) {
                for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) {
                    neu1[i] /= idxSyn0Length + isInference;
                }
            }
            __syncthreads();

            if (hsRounds > 0)
                for (int i = 0; i < hsRounds; i++) {
                    if (threadIdx.x == 0) {
                        args[1] = syn1 + (idxSyn1[i] * vectorLength);
                        idxArgs[2] = codes[i];
                    }
                    __syncthreads();

                    HierarchicSoftmax<T>::executeAggregateCuda(args, 4, nullptr, 0, idxArgs, 4, nullptr, 0, realArguments, 2);
                }

            __shared__ int target;
            if (ngRounds > 0)
                for (int i = 0; i < ngRounds + 1; i++) {
                    if (threadIdx.x == 0) {
                        if (i == 0) {
                            target = ngStarter;
                        } else {
                            next_random = next_random * (unsigned long long) 25214903917 + 11;
                            target = negTable[(next_random >> 16) % negTableLength];

                            if (target <= 0 || target >= vocabSize) target = next_random % (vocabSize - 1) + 1;
                        }

                        args[1] = syn1Neg + (target * vectorLength); // syn1Neg instead of syn1
                        idxArgs[2] = i == 0 ? 1 : 0;
                    }
                    __syncthreads();

                    if (i != 0 && target == ngStarter)
                        continue;


                    NegativeSampling<T>::executeAggregateCuda(args, 4, nullptr, 0, idxArgs, 4, nullptr, 0, realArguments, 2);

                }


            // if we're not training context words, skip the start of idxSyn0 and update only the label rows
            int starter = trainWords == 1 ? 0 : idxSyn0Length - numLabels;

            if (!isInference)
                for (int c = starter; c < idxSyn0Length; c++) {
                    T *syn0word = arguments[0] + (idxSyn0[c] * vectorLength);

                    for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) {
                        syn0word[i] += neu1e[i];
                    }
                }
            else {
                for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) {
                    inferenceVector[i] += neu1e[i];
                }
            }

        }
#endif
    };

}

#endif //LIBND4J_AGGREGATE_OPS_H