DeepLearningExamples/FasterTransformer/v3.1/fastertransformer/cuda/decoding_kernel_check.cpp

/*
 * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "decoding_kernel_check.h"
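
// CPU reference checks for the decoding CUDA kernels: each function below runs a
// kernel launcher, recomputes the expected result on the host, and compares the
// two results, aborting on any mismatch. Intended for debugging, not production runs.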
namespace fastertransformer
{
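
// Checks init_kernelLauncher: after initialization every beam is expected to be
// unfinished, with sequence length 0 and word id `sentence_id`, and the cumulative
// log probability is 0 for the first beam of each batch and -1e20 for the others.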
void init_kernel_check(bool *d_finished, int *d_sequence_length, int *d_word_ids, float *d_cum_log_probs,
                       const int sentence_id, const int batch_size, const int beam_width, cudaStream_t stream)
{
  printf("[INFO] decoding init check. \n");
  bool *h_finished = new bool[batch_size * beam_width];
  int *h_sequence_length = new int[batch_size * beam_width];
  int *h_word_ids = new int[batch_size * beam_width];
  float *h_cum_log_probs = new float[batch_size * beam_width];

  init_kernelLauncher(d_finished, d_sequence_length, d_word_ids, d_cum_log_probs,
                      sentence_id, batch_size, beam_width, stream);
  cudaDeviceSynchronize();
  check_cuda_error(cudaGetLastError());
  check_cuda_error(cudaMemcpy(h_finished, d_finished, sizeof(bool) * batch_size * beam_width, cudaMemcpyDeviceToHost));
  check_cuda_error(cudaMemcpy(h_sequence_length, d_sequence_length, sizeof(int) * batch_size * beam_width, cudaMemcpyDeviceToHost));
  check_cuda_error(cudaMemcpy(h_word_ids, d_word_ids, sizeof(int) * batch_size * beam_width, cudaMemcpyDeviceToHost));
  check_cuda_error(cudaMemcpy(h_cum_log_probs, d_cum_log_probs, sizeof(float) * batch_size * beam_width, cudaMemcpyDeviceToHost));

  // CPU reference initialization
  bool *h_finished_cpu = new bool[batch_size * beam_width];
  int *h_sequence_length_cpu = new int[batch_size * beam_width];
  int *h_word_ids_cpu = new int[batch_size * beam_width];
  float *h_cum_log_probs_cpu = new float[batch_size * beam_width];
  for (int i = 0; i < batch_size * beam_width; i++)
  {
    h_finished_cpu[i] = false;
    h_sequence_length_cpu[i] = 0;
    h_word_ids_cpu[i] = sentence_id;
    if (i % beam_width == 0)
      h_cum_log_probs_cpu[i] = 0.0f;
    else
      h_cum_log_probs_cpu[i] = -1e20f;
  }

  // compare CPU and GPU results
  for (int i = 0; i < batch_size * beam_width; i++)
  {
    if (h_finished[i] != h_finished_cpu[i])
    {
      printf("[ERROR] finished initialization fails. \n");
      exit(-1);
    }
    if (h_sequence_length[i] != h_sequence_length_cpu[i])
    {
      printf("[ERROR] sequence length initialization fails. \n");
      exit(-1);
    }
    if (h_word_ids[i] != h_word_ids_cpu[i])
    {
      printf("[ERROR] %d kernel word id: %d, cpu word id: %d \n", i, h_word_ids[i], h_word_ids_cpu[i]);
      printf("[ERROR] word ids initialization fails. \n");
      exit(-1);
    }
    if (h_cum_log_probs[i] != h_cum_log_probs_cpu[i])
    {
      printf("[ERROR] cum log probs initialization fails. \n");
      exit(-1);
    }
  }

  delete[] h_cum_log_probs_cpu;
  delete[] h_word_ids_cpu;
  delete[] h_sequence_length_cpu;
  delete[] h_finished_cpu;
  delete[] h_cum_log_probs;
  delete[] h_word_ids;
  delete[] h_sequence_length;
  delete[] h_finished;
  printf("[INFO] decoding init check finish. \n");
}
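
// Checks update_logits: the kernel is expected to add the bias and apply a
// log-softmax over the vocabulary for each of the m = batch_size * beam_width rows,
// forcing finished rows onto end_id. The same computation is repeated on the host
// and the two results are compared element-wise within a small tolerance.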
void update_logits_kernel_check(float *logits, const float *bias, const int end_id, const bool *finished,
                                const int m, const int n, cudaStream_t stream)
{
  // m: batch_size * beam_width
  // n: vocab_size
  printf("[INFO] decoding update logits check. \n");
  float *h_logits = new float[m * n];
  float *h_logits_after_update = new float[m * n];
  float *h_logits_after_update_cpu = new float[m * n];
  float *h_bias = new float[n];
  bool *h_finished = new bool[m];

  check_cuda_error(cudaMemcpy(h_logits, logits, sizeof(float) * m * n, cudaMemcpyDeviceToHost));
  check_cuda_error(cudaMemcpy(h_bias, bias, sizeof(float) * n, cudaMemcpyDeviceToHost));
  check_cuda_error(cudaMemcpy(h_finished, finished, sizeof(bool) * m, cudaMemcpyDeviceToHost));

  update_logits(logits, bias, end_id, finished, m, n, stream);
  cudaDeviceSynchronize();
  check_cuda_error(cudaGetLastError());
  check_cuda_error(cudaMemcpy(h_logits_after_update, logits, sizeof(float) * m * n, cudaMemcpyDeviceToHost));

  // update the logits on the CPU
  // add the bias; finished rows are forced onto end_id so log-softmax keeps emitting end_id
  for (int i = 0; i < m; i++)
  {
    if (h_finished[i] == false)
    {
      for (int j = 0; j < n; j++)
      {
        h_logits_after_update_cpu[i * n + j] = h_logits[i * n + j] + h_bias[j];
      }
    }
    else
    {
      for (int j = 0; j < n; j++)
      {
        h_logits_after_update_cpu[i * n + j] = ((j == end_id) ? FLT_MAX : -1 * FLT_MAX);
      }
    }
  }
  // compute log-softmax row by row
  for (int i = 0; i < m; i++)
  {
    // reduce max
    float max = -1 * FLT_MAX;
    for (int j = 0; j < n; j++)
    {
      float val = h_logits_after_update_cpu[i * n + j];
      if (val > max)
        max = val;
    }
    // subtract the max value to prevent overflow, then exponentiate and accumulate the sum
    float sum = 0.0f;
    for (int j = 0; j < n; j++)
    {
      h_logits_after_update_cpu[i * n + j] = expf(h_logits_after_update_cpu[i * n + j] - max);
      sum = sum + h_logits_after_update_cpu[i * n + j];
    }
    for (int j = 0; j < n; j++)
    {
      h_logits_after_update_cpu[i * n + j] = logf(h_logits_after_update_cpu[i * n + j] / sum);
    }
  }

  // compare the logits
  for (int i = 0; i < m * n; i++)
  {
    float diff = h_logits_after_update_cpu[i] - h_logits_after_update[i];
    if (diff < 0)
      diff = diff * -1.0f;
    if (diff > 2e-5)
    {
      printf("[ERROR] update logits fails on %d with | %f - %f | = %f. \n", i, h_logits_after_update_cpu[i], h_logits_after_update[i], diff);
      exit(-1);
    }
  }

  delete[] h_logits;
  delete[] h_logits_after_update;
  delete[] h_logits_after_update_cpu;
  delete[] h_bias;
  delete[] h_finished;
  printf("[INFO] decoding update logits check finish. \n");
}
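
// Checks broadcast_kernelLauncher: each row of log_probs (one row per beam) is
// expected to have that beam's cumulative log probability added to every vocabulary
// entry. The host recomputes the sums and compares within a small tolerance.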
void broadcast_kernel_check(float *log_probs, float *cum_log_probs, const int batch_size, const int beam_width,
                            const int vocab_size, cudaStream_t stream)
{
  printf("[INFO] decoding broadcast check. \n");
  float *h_log_probs = new float[batch_size * beam_width * vocab_size];
  float *h_cum_log_probs = new float[batch_size * beam_width];
  float *h_log_probs_after_update = new float[batch_size * beam_width * vocab_size];
  float *h_log_probs_after_update_cpu = new float[batch_size * beam_width * vocab_size];

  check_cuda_error(cudaMemcpy(h_log_probs, log_probs, sizeof(float) * batch_size * beam_width * vocab_size, cudaMemcpyDeviceToHost));
  check_cuda_error(cudaMemcpy(h_cum_log_probs, cum_log_probs, sizeof(float) * batch_size * beam_width, cudaMemcpyDeviceToHost));

  broadcast_kernelLauncher(log_probs, cum_log_probs, batch_size, beam_width, vocab_size, stream);
  cudaDeviceSynchronize();
  check_cuda_error(cudaGetLastError());
  check_cuda_error(cudaMemcpy(h_log_probs_after_update, log_probs, sizeof(float) * batch_size * beam_width * vocab_size, cudaMemcpyDeviceToHost));

  // CPU reference: add each beam's cumulative log probability to every vocabulary entry of that beam
  for (int i = 0; i < batch_size * beam_width; i++)
  {
    for (int j = 0; j < vocab_size; j++)
    {
      h_log_probs_after_update_cpu[i * vocab_size + j] = h_log_probs[i * vocab_size + j] + h_cum_log_probs[i];
    }
  }

  // compare the log probabilities
  for (int i = 0; i < batch_size * beam_width * vocab_size; i++)
  {
    float diff = h_log_probs_after_update_cpu[i] - h_log_probs_after_update[i];
    if (diff < 0)
      diff = diff * -1.0f;
    if (diff > 1e-5)
    {
      printf("[ERROR] broadcast fails on %d with | %f - %f | = %f. \n",
             i, h_log_probs_after_update_cpu[i], h_log_probs_after_update[i], diff);
      exit(-1);
    }
  }

  delete[] h_log_probs;
  delete[] h_cum_log_probs;
  delete[] h_log_probs_after_update;
  delete[] h_log_probs_after_update_cpu;
  printf("[INFO] decoding broadcast check finish. \n");
}
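
// Checks topK: for every batch, the kernel is expected to return the indices of the
// beam_width largest values among the beam_width * vocab_size candidates. The host
// reference recovers the same indices by repeated argmax; index mismatches that
// point at equal values (ties) are reported as warnings only.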
void topK_kernel_check(const float *log_probs, int *ids, const int batch_size, const int beam_width, const int vocab_size,
                       cudaStream_t stream)
{
  printf("[INFO] decoding topK check. \n");
  float *h_log_probs = new float[batch_size * beam_width * vocab_size];
  int *h_ids_after_update = new int[batch_size * beam_width];
  int *h_ids_after_update_cpu = new int[batch_size * beam_width];

  topK(log_probs, ids, batch_size, beam_width, vocab_size, stream);
  cudaDeviceSynchronize();
  check_cuda_error(cudaGetLastError());
  check_cuda_error(cudaMemcpy(h_log_probs, log_probs, sizeof(float) * batch_size * beam_width * vocab_size, cudaMemcpyDeviceToHost));
  check_cuda_error(cudaMemcpy(h_ids_after_update, ids, sizeof(int) * batch_size * beam_width, cudaMemcpyDeviceToHost));

  // CPU reference: for each batch, take the beam_width largest values over all
  // beam_width * vocab_size candidates by repeated argmax, recording flat indices
  for (int i = 0; i < batch_size; i++)
  {
    for (int j = 0; j < beam_width; j++)
    {
      float max_val = -1 * FLT_MAX;
      int max_id = -1;
      for (int k = 0; k < beam_width * vocab_size; k++)
      {
        if (h_log_probs[i * beam_width * vocab_size + k] > max_val)
        {
          max_id = i * beam_width * vocab_size + k;
          max_val = h_log_probs[max_id];
        }
      }
      h_ids_after_update_cpu[i * beam_width + j] = max_id;
      h_log_probs[max_id] = -FLT_MAX; // exclude the picked entry from the next argmax
    }
  }
  // restore the host copy of log_probs (the repeated argmax above overwrote the picked entries)
  check_cuda_error(cudaMemcpy(h_log_probs, log_probs, sizeof(float) * batch_size * beam_width * vocab_size, cudaMemcpyDeviceToHost));

  // compare the topK indices
  for (int i = 0; i < batch_size * beam_width; i++)
  {
    if (h_ids_after_update[i] != h_ids_after_update_cpu[i])
    {
      for (int j = 0; j < batch_size * beam_width; j++)
      {
        printf("[INFO] cpu result: %d %d %f \n", j, h_ids_after_update_cpu[j], h_log_probs[h_ids_after_update_cpu[j]]);
      }
      for (int j = 0; j < batch_size * beam_width; j++)
      {
        printf("[INFO] gpu result: %d %d %f \n", j, h_ids_after_update[j], h_log_probs[h_ids_after_update[j]]);
      }
      printf("[WARNING] topK mismatch on %d with %d (%f) %d (%f). \n", i,
             h_ids_after_update_cpu[i], h_log_probs[h_ids_after_update_cpu[i]],
             h_ids_after_update[i], h_log_probs[h_ids_after_update[i]]);
      if (h_log_probs[h_ids_after_update_cpu[i]] != h_log_probs[h_ids_after_update[i]])
      {
        printf("[ERROR] topK fails on %d with %d (%f) %d (%f). \n", i,
               h_ids_after_update_cpu[i], h_log_probs[h_ids_after_update_cpu[i]],
               h_ids_after_update[i], h_log_probs[h_ids_after_update[i]]);
        exit(-1);
      }
    }
  }

  delete[] h_log_probs;
  delete[] h_ids_after_update;
  delete[] h_ids_after_update_cpu;
  printf("[INFO] decoding topK check finish. \n");
}
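
// Checks update_kernelLauncher: given the flat indices selected by topK, the kernel
// is expected to update, per beam, the cumulative log probability, finished flag,
// sequence length, parent (source beam) id, and output word id. The host derives
// the same quantities from the copied inputs and compares them element-wise.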
void update_kernel_check(float *log_probs, float *cum_log_probs, int *ids, bool *finished, int *parent_ids, int *sequence_length,
                         int *word_ids, int *output_ids,
                         const int batch_size, const int beam_width,
                         const int vocab_size, cudaStream_t stream,
                         const int end_id, int *finished_count)
{
  printf("[INFO] decoding update check. \n");
  // CPU inputs
  float *h_log_probs = new float[batch_size * beam_width * vocab_size];
  int *h_ids = new int[batch_size * beam_width];
  bool *h_finished = new bool[batch_size * beam_width];
  int *h_parent_ids = new int[batch_size * beam_width];
  int *h_sequence_length = new int[batch_size * beam_width];
  int *h_output_ids = new int[batch_size * beam_width];
  // CPU outputs
  float *h_cum_log_probs_after_update_cpu = new float[batch_size * beam_width];
  bool *h_finished_after_update_cpu = new bool[batch_size * beam_width];
  int *h_parent_ids_after_update_cpu = new int[batch_size * beam_width];
  int *h_sequence_length_after_update_cpu = new int[batch_size * beam_width];
  int *h_output_ids_after_update_cpu = new int[batch_size * beam_width];
  // GPU outputs
  float *h_cum_log_probs_after_update = new float[batch_size * beam_width];
  bool *h_finished_after_update = new bool[batch_size * beam_width];
  int *h_parent_ids_after_update = new int[batch_size * beam_width];
  int *h_sequence_length_after_update = new int[batch_size * beam_width];
  int *h_output_ids_after_update = new int[batch_size * beam_width];

  // copy the CPU inputs
  check_cuda_error(cudaMemcpy(h_log_probs, log_probs, sizeof(float) * batch_size * beam_width * vocab_size, cudaMemcpyDeviceToHost));
  check_cuda_error(cudaMemcpy(h_ids, ids, sizeof(int) * batch_size * beam_width, cudaMemcpyDeviceToHost));
  check_cuda_error(cudaMemcpy(h_finished, finished, sizeof(bool) * batch_size * beam_width, cudaMemcpyDeviceToHost));
  check_cuda_error(cudaMemcpy(h_parent_ids, parent_ids, sizeof(int) * batch_size * beam_width, cudaMemcpyDeviceToHost));
  check_cuda_error(cudaMemcpy(h_sequence_length, sequence_length, sizeof(int) * batch_size * beam_width, cudaMemcpyDeviceToHost));
  check_cuda_error(cudaMemcpy(h_output_ids, output_ids, sizeof(int) * batch_size * beam_width, cudaMemcpyDeviceToHost));

  // compute on the GPU and copy back the GPU outputs
  update_kernelLauncher(log_probs, cum_log_probs, finished, parent_ids, sequence_length, word_ids, output_ids,
                        batch_size, beam_width, vocab_size, stream, end_id, finished_count);
  cudaDeviceSynchronize();
  check_cuda_error(cudaGetLastError());
  check_cuda_error(cudaMemcpy(h_cum_log_probs_after_update, cum_log_probs, sizeof(float) * batch_size * beam_width, cudaMemcpyDeviceToHost));
  check_cuda_error(cudaMemcpy(h_finished_after_update, finished, sizeof(bool) * batch_size * beam_width, cudaMemcpyDeviceToHost));
  check_cuda_error(cudaMemcpy(h_parent_ids_after_update, parent_ids, sizeof(int) * batch_size * beam_width, cudaMemcpyDeviceToHost));
  check_cuda_error(cudaMemcpy(h_sequence_length_after_update, sequence_length, sizeof(int) * batch_size * beam_width, cudaMemcpyDeviceToHost));
  check_cuda_error(cudaMemcpy(h_output_ids_after_update, output_ids, sizeof(int) * batch_size * beam_width, cudaMemcpyDeviceToHost));

  // compute on the CPU
  // unfinished beams advance their sequence length by one
  for (int i = 0; i < batch_size * beam_width; i++)
  {
    if (h_finished[i] == false)
      h_sequence_length[i] = h_sequence_length[i] + 1;
  }
  // decode each selected flat index into (source beam, word id) and gather the per-beam outputs
  for (int i = 0; i < batch_size * beam_width; i++)
  {
    int sample_id = h_ids[i];
    int word_id = h_ids[i] % vocab_size;
    int beam_index = h_ids[i] / vocab_size;
    h_cum_log_probs_after_update_cpu[i] = h_log_probs[sample_id];
    h_finished_after_update_cpu[i] = h_finished[beam_index];
    h_sequence_length_after_update_cpu[i] = h_sequence_length[beam_index];
    h_parent_ids_after_update_cpu[i] = beam_index;
    h_output_ids_after_update_cpu[i] = word_id;
    printf("[INFO] sample id %d, word id %d, beam id %d, with log prob: %f \n", sample_id, word_id, beam_index, h_log_probs[sample_id]);
  }

  // compare CPU and GPU outputs
  for (int i = 0; i < batch_size * beam_width; i++)
  {
    if (h_parent_ids_after_update[i] != h_parent_ids_after_update_cpu[i])
    {
      printf("[ERROR] update %d parent_ids fails: %d %d. \n", i, h_parent_ids_after_update_cpu[i], h_parent_ids_after_update[i]);
      exit(-1);
    }
    if (h_output_ids_after_update[i] != h_output_ids_after_update_cpu[i])
    {
      printf("[ERROR] update %d output_ids fails: %d %d. \n", i, h_output_ids_after_update_cpu[i], h_output_ids_after_update[i]);
      exit(-1);
    }
    if (h_cum_log_probs_after_update[i] != h_cum_log_probs_after_update_cpu[i])
    {
      printf("[ERROR] update %d cum log probs fails: %f %f. \n", i, h_cum_log_probs_after_update_cpu[i], h_cum_log_probs_after_update[i]);
      exit(-1);
    }
    if (h_finished_after_update[i] != h_finished_after_update_cpu[i])
    {
      printf("[ERROR] update %d finished fails: %d %d. \n", i, h_finished_after_update_cpu[i], h_finished_after_update[i]);
      exit(-1);
    }
    if (h_sequence_length_after_update[i] != h_sequence_length_after_update_cpu[i])
    {
      printf("[ERROR] update %d sequence length fails: %d %d. \n", i, h_sequence_length_after_update_cpu[i], h_sequence_length_after_update[i]);
      exit(-1);
    }
  }

  delete[] h_log_probs;
  delete[] h_ids;
  delete[] h_finished;
  delete[] h_parent_ids;
  delete[] h_sequence_length;
  delete[] h_output_ids;
  delete[] h_cum_log_probs_after_update_cpu;
  delete[] h_finished_after_update_cpu;
  delete[] h_parent_ids_after_update_cpu;
  delete[] h_sequence_length_after_update_cpu;
  delete[] h_output_ids_after_update_cpu;
  delete[] h_cum_log_probs_after_update;
  delete[] h_finished_after_update;
  delete[] h_parent_ids_after_update;
  delete[] h_sequence_length_after_update;
  delete[] h_output_ids_after_update;
  printf("[INFO] decoding update check finish. \n");
}
} // end of namespace fastertransformer