Unexpected U/V plane offset with WIndows Media Foundation H264 decoder - h.264

While decoding H264 video via Windows Media Foundation using IMFSourceReader, I am seeing an unexpected y-offset in the U/V plane data. By trial and error, I have found an adjustment that seems to work on all the video sources I've tried, but I'd really like to know if this data layout is expected or documented.
Specifically, sometimes IMFSample::GetTotalLength() returns a total buffer size that is greater than height * stride * 3/2. In that case, the U/V planes are offset from the end of the Y plane data by 2/3 of the extra data, like so:
https://stash.reaper.fm/44192/YV12.png
To be clear, the unexpected part is the N rows of pixels below the Y plane data and above the U plane data. I think all of the other offsets and padding are as expected. Is this data layout expected or documented?
Image before applying adjustment:
https://stash.reaper.fm/44193/before.jpg
Image after applying adjustment:
https://stash.reaper.fm/44194/after.jpg
The video:
https://stash.reaper.fm/44214/johnny.mp4
Here is a complete program that draws the first frame of the video with and without the adjustment. There is no error checking and the YUV to RGB conversion is very basic.
#include <windows.h>
#include <initguid.h>
#include <mfapi.h>
#include <mfidl.h>
#include <mfreadwrite.h>
#include "resource.h"
const int marg=16;
unsigned char clamp(int v)
{
return v < 0 ? 0 : v > 255 ? 255 : v;
}
void yuv_to_rgb(unsigned char *rgb, int y, int u, int v)
{
y -= 16;
u -= 128;
v -= 128;
rgb[3] = 0;
rgb[2] = clamp((y*298 + v*409 + 128) / 256);
rgb[1] = clamp((y*298 - u*100 - v*208 + 128) / 256);
rgb[0] = clamp((y*298 + u*516 + 128) / 256);
}
INT_PTR CALLBACK wndproc(HWND hwndDlg, UINT uMsg, WPARAM wParam, LPARAM lParam)
{
static HBITMAP rawbmp = NULL, adjbmp=NULL;
static unsigned int srcw = 0, srch = 0;
switch (uMsg)
{
case WM_INITDIALOG:
{
MFStartup(MF_VERSION);
IMFSourceReader *reader = NULL;
MFCreateSourceReaderFromURL(L"C:\\Users\\xxx\\Documents\\johnny.mp4", NULL, &reader);
reader->SetStreamSelection(MF_SOURCE_READER_ALL_STREAMS, FALSE);
reader->SetStreamSelection(MF_SOURCE_READER_FIRST_VIDEO_STREAM, TRUE);
IMFMediaType *fmt = NULL;
MFCreateMediaType(&fmt);
fmt->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Video);
fmt->SetGUID(MF_MT_SUBTYPE, MFVideoFormat_YV12);
reader->SetCurrentMediaType(MF_SOURCE_READER_FIRST_VIDEO_STREAM, NULL, fmt);
fmt->Release();
reader->GetCurrentMediaType(MF_SOURCE_READER_FIRST_VIDEO_STREAM, &fmt);
MFGetAttributeSize(fmt, MF_MT_FRAME_SIZE, &srcw, &srch);
fmt->Release();
IMFSample *sample = NULL;
DWORD flags=0;
INT64 readpos=0;
IMFMediaBuffer *buffer = NULL;
DWORD bufsz=0;
reader->ReadSample(MF_SOURCE_READER_FIRST_VIDEO_STREAM, 0, NULL, &flags, &readpos, &sample);
sample->GetTotalLength(&bufsz);
sample->ConvertToContiguousBuffer(&buffer);
sample->Release();
reader->Release();
BITMAPINFO bi = {0};
bi.bmiHeader.biSize = sizeof(BITMAPINFOHEADER);
bi.bmiHeader.biWidth = srcw;
bi.bmiHeader.biHeight = srch;
bi.bmiHeader.biPlanes = 1;
bi.bmiHeader.biBitCount = 32;
bi.bmiHeader.biCompression = BI_RGB;
unsigned char *raw = NULL, *adj=NULL;
rawbmp = CreateDIBSection(NULL, &bi, DIB_RGB_COLORS, (void**)&raw, NULL, 0);
adjbmp = CreateDIBSection(NULL, &bi, DIB_RGB_COLORS, (void**)&adj, NULL, 0);
IMF2DBuffer *buffer2d = NULL;
BYTE *bptr = NULL;
LONG bstride = 0;
buffer->QueryInterface(&buffer2d);
buffer2d->Lock2D(&bptr, &bstride);
int offs=(bufsz*2/3-srch*bstride); // unexpected
unsigned char *rawptr = raw + srcw*(srch-1)*4;
unsigned char *adjptr = adj + srcw*(srch-1)*4;
unsigned char *yptr = bptr;
unsigned char *uptr = bptr + srch*bstride*5/4;
unsigned char *vptr = bptr + srch*bstride;
for (unsigned int y=0; y < srch; ++y)
{
for (unsigned int x=0; x < srcw; ++x)
{
yuv_to_rgb(rawptr+x*4, yptr[x], uptr[x/2], vptr[x/2]);
yuv_to_rgb(adjptr+x*4, yptr[x], uptr[x/2+offs], vptr[x/2+offs]);
}
rawptr -= srcw*4;
adjptr -= srcw*4;
yptr += bstride;
if (y&1) uptr += bstride/2;
if (y&1) vptr += bstride/2;
}
buffer2d->Unlock2D();
buffer2d->Release();
buffer->Release();
SetWindowPos(hwndDlg, NULL, 0, 0, srcw+2*marg, 2*(srch+2*marg), SWP_NOZORDER|SWP_NOMOVE|SWP_NOACTIVATE);
}
return 0;
case WM_DESTROY:
{
DeleteObject(rawbmp);
DeleteObject(adjbmp);
}
return 0;
case WM_PAINT:
{
RECT r;
GetClientRect(hwndDlg, &r);
int w=r.right, h=r.bottom;
PAINTSTRUCT ps;
HDC dc = BeginPaint(hwndDlg, &ps);
HDC srcdc = CreateCompatibleDC(dc);
SelectObject(srcdc, rawbmp);
BitBlt(dc, marg/2, marg/2, srcw, srch, srcdc, 0, 0, SRCCOPY);
SelectObject(srcdc, adjbmp);
BitBlt(dc, marg/2, (h+marg)/2, srcw, srch, srcdc, 0, 0, SRCCOPY);
EndPaint(hwndDlg, &ps);
ReleaseDC(hwndDlg, srcdc);
}
return 0;
case WM_COMMAND:
if (LOWORD(wParam) == IDCANCEL)
{
EndDialog(hwndDlg, 0);
}
return 0;
}
return 0;
}
int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine, int nShowCmd)
{
DialogBox(hInstance, MAKEINTRESOURCE(IDD_DIALOG), GetDesktopWindow(), wndproc);
return 0;
}

Related

how to use gif_lib encode RGB to gif

If I run RGB2GIF like:
RGB2GIF(true, 1, "../tests/porsche.rgb", ExpNumOfColors, 320, 200);
the output is like original image
but when I use another rgb file:
RGB2GIF(true, 1, "D:\\rgb.yuv", ExpNumOfColors, 1280, 720);`
the out gif file missing colors.
Is it relative to global color map? local color map?
static void SaveGif(GifByteType *OutputBuffer, int Width, int Height, int ExpColorMapSize, ColorMapObject *OutputColorMap){int i, Error;
GifFileType *GifFile;
GifByteType *Ptr = OutputBuffer;
/* Open stdout for the output file: */
gfile = fopen("D:\\out_my.gif", "wb+");
if ((GifFile = EGifOpen(gfile, writeGifData, &Error)) == NULL) {
PrintGifError(Error);
exit(EXIT_FAILURE);
}
if (EGifPutScreenDesc(GifFile,
Width, Height, ExpColorMapSize, 0,
OutputColorMap) == GIF_ERROR ||
EGifPutImageDesc(GifFile,
0, 0, Width, Height, false, OutputColorMap) ==
GIF_ERROR){
PrintGifError(Error);
printf("error\n");
exit(EXIT_FAILURE);
}
GifQprintf("\n%s: Image 1 at (%d, %d) [%dx%d]: ",
PROGRAM_NAME, GifFile->Image.Left, GifFile->Image.Top,
GifFile->Image.Width, GifFile->Image.Height);
for (i = 0; i < Height; i++) {
if (EGifPutLine(GifFile, Ptr, Width) == GIF_ERROR)
exit(EXIT_FAILURE);
GifQprintf("\b\b\b\b%-4d", Height - i - 1);
Ptr += Width;
}
printf("\n");
printf("close file\n");
if (EGifCloseFile(GifFile, &Error) == GIF_ERROR)
PrintGifError(Error);
fclose(gfile);
exit(EXIT_FAILURE);
}
static void RGB2GIF(bool OneFileFlag, int NumFiles, char *FileName, int
ExpNumOfColors, int Width, int Height)
{
int ColorMapSize;
GifByteType *RedBuffer = NULL, *GreenBuffer = NULL, *BlueBuffer = NULL,
*OutputBuffer = NULL;
ColorMapObject *OutputColorMap = NULL;
ColorMapSize = 1 << ExpNumOfColors;
if (NumFiles == 1) {
LoadRGB(FileName, OneFileFlag,
&RedBuffer, &GreenBuffer, &BlueBuffer, Width, Height);
}
else {
LoadRGB(NULL, OneFileFlag,
&RedBuffer, &GreenBuffer, &BlueBuffer, Width, Height);
}
if ((OutputColorMap = GifMakeMapObject(ColorMapSize, NULL)) == NULL ||
(OutputBuffer = (GifByteType *)malloc(Width * Height *
sizeof(GifByteType))) == NULL)
GIF_EXIT("Failed to allocate memory required, aborted.");
printf("GifQuantizeBuffer\n");
if (GifQuantizeBuffer(Width, Height, &ColorMapSize,
RedBuffer, GreenBuffer, BlueBuffer,
OutputBuffer, OutputColorMap->Colors) == GIF_ERROR)
exit(EXIT_FAILURE);
free((char *)RedBuffer);
free((char *)GreenBuffer);
free((char *)BlueBuffer);
SaveGif(OutputBuffer, Width, Height, ExpNumOfColors, OutputColorMap);
}
nobody use this lib encode gif? is there some other libs to encode gif, except gif.h.

No NVENC capable devices found

I met a weird question.I have been using FFmpeg's NVENC to encode video .It is strange that I can use h264_nvenc smoothly without problem,but when I replace h264_nvenc with hevc_nvenc,I got the problem "No NVENC capable devices found".The FFmpeg version I am using is 3.2,and I use command line to encode with hevc_nvenc,it works ok.My code is here:
#include "stdafx.h"
int flush_encoder(AVFormatContext *fmt_ctx, unsigned int stream_index)
{
int ret;
int got_frame;
AVPacket enc_pkt;
if (!(fmt_ctx->streams[stream_index]->codec->codec->capabilities &
CODEC_CAP_DELAY))
return 0;
while (1) {
printf("Flushing stream #%u encoder\n", stream_index);
//ret = encode_write_frame(NULL, stream_index, &got_frame);
enc_pkt.data = NULL;
enc_pkt.size = 0;
av_init_packet(&enc_pkt);
ret = avcodec_encode_video2(fmt_ctx->streams[stream_index]->codec, &enc_pkt,
NULL, &got_frame);
av_frame_free(NULL);
if (ret < 0)
break;
if (!got_frame){
ret = 0;
break;
}
printf("Succeed to encode 1 frame! 编码成功1帧!\n");
/* mux encoded frame */
ret = av_write_frame(fmt_ctx, &enc_pkt);
if (ret < 0)
break;
}
return ret;
}
int main(int argc, char* argv[])
{
AVFormatContext* pFormatCtx;
AVOutputFormat* fmt;
AVStream* video_st;
AVCodecContext* pCodecCtx;
AVCodec* pCodec;
uint8_t* picture_buf;
AVFrame* picture;
int size;
FILE *in_file = fopen("test_yuv420p_320x180.yuv", "rb"); //Input YUV data 视频YUV源文件
int in_w = 320, in_h = 180;//宽高
int framenum = 100;
const char* out_file = "ds.hevc";
av_register_all();
//Method1 方法1.组合使用几个函数
pFormatCtx = avformat_alloc_context();
//Guess Format 猜格式
fmt = av_guess_format(NULL, out_file, NULL);
pFormatCtx->oformat = fmt;
//Method 2 方法2.更加自动化一些
//avformat_alloc_output_context2(&pFormatCtx, NULL, NULL, out_file);
//fmt = pFormatCtx->oformat;
//Output Format 注意输出路径
if (avio_open(&pFormatCtx->pb, out_file, AVIO_FLAG_READ_WRITE) < 0)
{
printf("Failed to open output file! 输出文件打开失败");
return -1;
}
video_st = avformat_new_stream(pFormatCtx, 0);
video_st->time_base.num = 1;
video_st->time_base.den = 25;
if (video_st == NULL)
{
return -1;
}
//Param that must set
pCodecCtx = video_st->codec;
pCodecCtx->codec_id =AV_CODEC_ID_HEVC;
//pCodecCtx->codec_id = fmt->video_codec;
pCodecCtx->codec_type = AVMEDIA_TYPE_VIDEO;
pCodecCtx->pix_fmt = AV_PIX_FMT_YUV420P;
pCodecCtx->width = in_w;
pCodecCtx->height = in_h;
pCodecCtx->time_base.num = 1;
pCodecCtx->time_base.den = 25;
pCodecCtx->bit_rate = 400000;
pCodecCtx->gop_size = 12;
//H264
//pCodecCtx->me_range = 16;
//pCodecCtx->max_qdiff = 4;
//pCodecCtx->qcompress = 0.6;
pCodecCtx->qmin = 10;
pCodecCtx->qmax = 51;
//Optional Param
pCodecCtx->max_b_frames = 3;
// Set Option
AVDictionary *param = 0;
//H.264
if (pCodecCtx->codec_id == AV_CODEC_ID_H264) {
av_dict_set(&param, "preset", "slow", 0);
av_dict_set(&param, "tune", "zerolatency", 0);
}
//H.265
if (pCodecCtx->codec_id == AV_CODEC_ID_H265){
av_dict_set(&param, "x265-params", "qp=20", 0);
av_dict_set(&param, "preset", "default", 0);
av_dict_set(&param, "tune", "zero-latency", 0);
}
//Dump Information 输出格式信息
av_dump_format(pFormatCtx, 0, out_file, 1);
//pCodec = avcodec_find_encoder(pCodecCtx->codec_id);
pCodec = avcodec_find_encoder_by_name("hevc_nvenc");
if (!pCodec){
printf("Can not find encoder! 没有找到合适的编码器!\n");
return -1;
}
if (avcodec_open2(pCodecCtx, pCodec, &param) < 0){
printf("Failed to open encoder! 编码器打开失败!\n");
return -1;
}
picture = av_frame_alloc();
size = avpicture_get_size(pCodecCtx->pix_fmt, pCodecCtx->width, pCodecCtx->height);
picture_buf = (uint8_t *)av_malloc(size);
avpicture_fill((AVPicture *)picture, picture_buf, pCodecCtx->pix_fmt, pCodecCtx->width, pCodecCtx->height);
//Write File Header 写文件头
avformat_write_header(pFormatCtx, NULL);
AVPacket pkt;
int y_size = pCodecCtx->width * pCodecCtx->height;
av_new_packet(&pkt, y_size * 3);
for (int i = 0; i<framenum; i++){
//Read YUV 读入YUV
if (fread(picture_buf, 1, y_size * 3 / 2, in_file) < 0){
printf("Failed to read YUV data! 文件读取错误\n");
return -1;
}
else if (feof(in_file)){
break;
}
picture->data[0] = picture_buf; // 亮度Y
picture->data[1] = picture_buf + y_size; // U
picture->data[2] = picture_buf + y_size * 5 / 4; // V
//PTS
picture->pts = i;
picture->format = pCodecCtx->pix_fmt;
picture->width = in_w;
picture->height = in_h;
int got_picture = 0;
//Encode 编码
int ret = avcodec_encode_video2(pCodecCtx, &pkt, picture, &got_picture);
if (ret < 0){
printf("Failed to encode! 编码错误!\n");
return -1;
}
if (got_picture == 1){
printf("Succeed to encode 1 frame! 编码成功1帧!\n");
pkt.stream_index = video_st->index;
ret = av_write_frame(pFormatCtx, &pkt);
av_free_packet(&pkt);
}
}
//Flush Encoder
int ret = flush_encoder(pFormatCtx, 0);
if (ret < 0) {
printf("Flushing encoder failed\n");
return -1;
}
//Write file trailer 写文件尾
av_write_trailer(pFormatCtx);
//Clean 清理
if (video_st){
avcodec_close(video_st->codec);
av_free(picture);
av_free(picture_buf);
}
avio_close(pFormatCtx->pb);
avformat_free_context(pFormatCtx);
fclose(in_file);
system("pause");
return 0;
}
Help!!!!
after a few days of strugglling,once again I try anwser the question myself.The key point is ,when encoding with hevc_nvenc,you must set pCodecCtx->max_b_frames = 0;(at least for version 3.2 of ffmpeg).

Cuda idx doesnt index matrices correctly

I have the following kernel in cuda:
__global__ void pagerank(Node *ingoing, Node *outgoing, int N) {
int j;
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if ((idx > 0) && (idx < N)){
//for(j=0;j<N;j++){
// outgoing[j].p_t1=ingoing[j].p_t1;
//}
outgoing[idx].p_t1=ingoing[idx].p_t1;
}
}
This doesnt work. The following works:
__global__ void pagerank(Node *ingoing, Node *outgoing, int N) {
int j;
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if ((idx > 0) && (idx < N)){
for(j=0;j<N;j++){
outgoing[j].p_t1=ingoing[j].p_t1;
}
//outgoing[idx].p_t1=ingoing[idx].p_t1;
}
}
What is wrong? Why idx doesnt index the matrices correctly?
The whole code is written below. It wouldn't be so easy to understand it. The thing is that when I print the outgoing[idx].p_t1 fields at the end of the main function they print 0s when I do
outgoing[idx].p_t1=ingoing[idx].p_t1;
but they are correct when I do
for(j=0;j<N;j++){
outgoing[j].p_t1=ingoing[j].p_t1;
}
Whats wrong?
/******************** Includes - Defines ****************/
#include "pagerank_serial.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <assert.h>
#include <string.h>
#include <sys/time.h>
#include <fcntl.h>
#include <cuda.h>
#include "string.h"
/******************** Defines ****************/
// Number of nodes
int N;
// Convergence threashold and algorithm's parameter d
double threshold, d;
// Table of node's data
Node *Nodes;
__global__ void pagerank(Node *ingoing, Node *outgoing, int N) {
int j;
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if ((idx > 0) && (idx < N)){
for(j=0;j<N;j++){
outgoing[j].p_t1=ingoing[j].p_t1;
}
//outgoing[idx].p_t1=ingoing[idx].p_t1;
}
}
/***** Read graph connections from txt file *****/
void Read_from_txt_file(char* filename)
{
FILE *fid;
int from_idx, to_idx;
int temp_size;
fid = fopen(filename, "r");
if (fid == NULL){
printf("Error opening data file\n");
}
while (!feof(fid))
{
if (fscanf(fid,"%d\t%d\n", &from_idx,&to_idx))
{
Nodes[from_idx].con_size++;
temp_size = Nodes[from_idx].con_size;
//Nodes[from_idx].To_id =(int*) realloc(Nodes[from_idx].To_id, temp_size * sizeof(int));
Nodes[from_idx].To_id[temp_size - 1] = to_idx;
}
}
//printf("End of connections insertion!\n");
fclose(fid);
}
/***** Read P vector from txt file*****/
void Read_P_from_txt_file()
{
FILE *fid;
double temp_P;
int index = 0;
fid = fopen("P.txt", "r");
if (fid == NULL){printf("Error opening the Probabilities file\n");}
while (!feof(fid))
{
// P's values are double!
if (fscanf(fid," double sum = 0;%lf\n", &temp_P))
{
Nodes[index].p_t1 = temp_P;
index++;
}
}
//printf("End of P insertion!");
fclose(fid);
}
/***** Read E vector from txt file*****/
void Read_E_from_txt_file()
{
FILE *fid;
double temp_E;
int index = 0;
fid = fopen("E.txt", "r");
if (fid == NULL)
printf("Error opening the E file\n");
while (!feof(fid))
{
// E's values are double!
if (fscanf(fid,"%lf\n", &temp_E))
{
Nodes[index].e = temp_E;
index++;
}
}
//printf("End of E insertion!");
fclose(fid);
}
/***** Create P and E with equal probability *****/
void Random_P_E()
{
int i;
// Sum of P (it must be =1)
double sum_P_1 = 0;
// Sum of E (it must be =1)
double sum_E_1 = 0;
// Arrays initialization
for (i = 0; i < N; i++)
{
Nodes[i].p_t0 = 0;
Nodes[i].p_t1 = 1;
Nodes[i].p_t1 = (double) Nodes[i].p_t1 / N;
sum_P_1 = sum_P_1 + Nodes[i].p_t1;
Nodes[i].e = 1;
Nodes[i].e = (double) Nodes[i].e / N;
sum_E_1 = sum_E_1 + Nodes[i].e;
}
// Assert sum of probabilities is =1
// Print sum of P (it must be =1)
//printf("Sum of P = %f\n",sum_P_1);
// Exit if sum of P is !=1
assert(sum_P_1 = 1);
//printf("\n");
// Print sum of E (it must be =1)
//printf("Sum of E = %f\n",sum_E_1);
// Exit if sum of Pt0 is !=1
assert(sum_E_1 = 1);
}
/***** Main function *****/
int main(int argc, char** argv)
{
int blockSize; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve the maximum occupancy for a full device launch
int gridSize; // The actual grid size needed, based on input size
// Check input arguments
if (argc < 5)
{
printf("Error in arguments! Three arguments required: graph filename, N, threshold and d\n");
return 0;
}
// get arguments
char filename[256];
strcpy(filename, argv[1]);
N = atoi(argv[2]);
threshold = atof(argv[3]);
d = atof(argv[4]);
int i;
// a constant value contributed of all nodes with connectivity = 0
// it's going to be addes to all node's new probability
// Allocate memory for N nodes
Nodes = (Node*) malloc(N * sizeof(Node));
for (i = 0; i < N; i++)
{
Nodes[i].con_size = 0;
//Nodes[i].To_id = (int*) malloc(sizeof(int));
}
Read_from_txt_file(filename);
// set random probabilities
Random_P_E();
Node *h_ingoing;
Node *h_outgoing;
h_ingoing = Nodes;
h_outgoing = (Node *)calloc(N, sizeof *h_outgoing);
Node *d_ingoing;
Node *d_outgoing;
cudaMalloc(&d_ingoing, N * sizeof *d_ingoing);
cudaMalloc(&d_outgoing, N * sizeof *d_outgoing);
cudaMemcpy(d_ingoing, h_ingoing, N * sizeof *h_ingoing, cudaMemcpyHostToDevice);
cudaMemcpy(d_outgoing, h_outgoing, N * sizeof *h_outgoing, cudaMemcpyHostToDevice);
float time;
cudaEvent_t begin, end;
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, pagerank, 0, N);
// Round up according to array size
gridSize = (N + blockSize - 1) / blockSize;
printf("Gridsize, blockzise : %d , %d \n", gridSize, blockSize);
cudaEventCreate(&begin);
cudaEventCreate(&end);
cudaEventRecord(begin, 0);
pagerank<<<gridSize, blockSize>>>(d_ingoing, d_outgoing, N, threshold, d);
cudaEventRecord(end, 0);
cudaEventSynchronize(end);
cudaEventElapsedTime(&time, begin, end);
cudaMemcpy(h_outgoing, d_outgoing, N * sizeof *h_outgoing, cudaMemcpyDeviceToHost);
printf("%f\n", time) ;
printf("\n");
// Print final probabilitities
for (i = 0; i <100; i++)
{
printf("P_t1[%d] = %f\n",i,h_outgoing[i].p_t1);
}
printf("\n");
printf("End of program!\n");
return (EXIT_SUCCESS);
}
When you say main function they print 0s when I do, I assume you are refering to all entries and not just index 0. Indeed, index 0 is not processed by your code with the fisrt version as ((idx > 0) && (idx < N)) is false for idx=0.
Getting further, in your code, we are missing the definition of the Node type. which is mandatory to get a better understanding of what could go wrong in your code.
Depending on the size of Node, its contents, and the structure packing you are using in compilation, it might be that Node size on host side differs from Node size on device. Using printf to verify that would be usefull, or using a debugger.
Also, you do not seem to be checking for error in launch. You definitely want to add a cudaPeekAtLastError and a cudaDeviceSynchronize after your kernel call to make sure no error occurred. (any other method call from cuda Runtime API may also return errors your code does not check).
EDIT
Trying to reproduce, I wrote the following, as close as possible to your code. I don't have a card with sufficient memory, hence the smaller node count.
typedef struct
{
double p_t0;
double p_t1;
double e;
int To_id[460];
int con_size;
} Node ;
__global__ void pagerank(Node* ingoing, Node* outgoing, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x ;
if ((idx > 0) && (idx < N))
outgoing[idx].p_t1 = ingoing[idx].p_t1;
}
#include <cstdlib>
#define cudaCheck(a) { cudaError_t cuerr = a ; if (cuerr != cudaSuccess) { printf("[ERROR # %s : %d ] : (%d) - %s\n", __FILE__, __LINE__, cuerr, cudaGetErrorString(cuerr)) ; ::exit(1) ; } }
int main()
{
// int N = 916428 ; // does not fit on my GPU
int N = 400000 ;
int blockSize;
int minGridSize;
int gridSize;
Node* Nodes = (Node*)malloc(N * sizeof (Node)) ;
for (int i = 0 ; i < N ; ++i)
Nodes[i].p_t1 = (double)i+1;
Node* h_ingoing = Nodes;
Node* h_outgoing = (Node*)calloc(N, sizeof *h_outgoing) ;
Node* d_ingoing ;
Node* d_outgoing ;
cudaCheck (cudaMalloc(&d_ingoing, N * sizeof *d_ingoing));
cudaCheck (cudaMalloc(&d_outgoing, N * sizeof *d_outgoing));
cudaCheck (cudaMemcpy (d_ingoing, h_ingoing, N * sizeof *h_ingoing, cudaMemcpyHostToDevice));
cudaCheck (cudaMemcpy (d_outgoing, h_outgoing, N * sizeof *h_outgoing, cudaMemcpyHostToDevice));
float time;
cudaEvent_t begin, end ;
//blockSize = 256 ;
cudaOccupancyMaxPotentialBlockSize<> (&minGridSize, &blockSize, pagerank, 0, N) ;
gridSize = (N + blockSize -1) / blockSize ;
printf ("Configuration = <<< %d , %d >>>\n", gridSize, blockSize) ;
cudaCheck (cudaEventCreate (&begin)) ;
cudaCheck (cudaEventCreate (&end)) ;
cudaCheck (cudaEventRecord (begin, 0)) ;
pagerank <<< gridSize, blockSize >>> (d_ingoing, d_outgoing, N) ;
cudaCheck (cudaEventRecord (end, 0)) ;
cudaCheck (cudaEventSynchronize (end)) ;
cudaCheck (cudaMemcpy (h_outgoing, d_outgoing, N * sizeof *h_outgoing, cudaMemcpyDeviceToHost)) ;
for (int i = 0 ; i < 100 ; ++i)
{
printf ("P_t1[%d] = %f\n", i, h_outgoing[i].p_t1) ;
}
for (int i = 0 ; i < N ; ++i)
{
if (h_outgoing[i].p_t1 != (double)(i+1))
printf ("Error # %d : %lf <> %lf\n", i, h_outgoing[i].p_t1, (double)(i+1));
}
return 0 ;
}
Except at index 0 for which the first draft of answer stated there was an issue, each output is correct.

Cuda Exceptions

I am doing something in CUDA (FFT), but I have no idea why it is generating exceptions when calling the kernel function.
All includes and definitions:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#define CPU_ARRAY_SIZE 1024 // 1024, 2048, 4096 8192
#define GPU_ARRAY_SIZE 512 //
#define THREAD_SIZE 16 // fixed
#define BLOCK_SIZE (GPU_ARRAY_SIZE/THREAD_SIZE) // 32
#define PI 3.14
As I am running it in a NVIDIA GTX480, I thought it could be the shared memory space, although it doesn't seem to be (as there are "some many" shared variables). So, I aws changing the GPU_ARRAY_SIZE to see how it works, and it was giving me different results when I define it as 32, 64, 256, 512 (in the 512 case, it returns ALL zeros, which I guess CUDA couldn't make anything - in other cases, it returns weird, as I don't know the reason why it jumps 16 cells without any calculation). In most cases, in the Output window of my Microsoft Visual Studio, it returns billions of exceptions of the style "First-chance exception at 0x75b9b9bc in .exe: Microsoft C++ exception: cudaError_enum at memory location ". Before you ask me to debug, I cannot debug it, as the VS doesn't do that for files that are not recognized by VS (like .cpp - at least this theory works in my case).
Do you guys have any idea for the questions:
1. why is it generating exceptions?
2. why is it calculating, what it should do for every cell in every block, just within few cells
How could I solve this problem... any idea?
Kernel function:
__global__ void twiddle_factor(double *d_isub_matrix, double *d_osub_matrix)
{
__shared__ double block[THREAD_SIZE][THREAD_SIZE];
__shared__ double spectrum[THREAD_SIZE][THREAD_SIZE];
__shared__ double sum_cos[THREAD_SIZE][THREAD_SIZE]; // declaring the shared sum_cos.. similarly for sum_sin
__shared__ double sum_sin[THREAD_SIZE][THREAD_SIZE];
__shared__ double local_cos[THREAD_SIZE][THREAD_SIZE]; // declaring the shared sum_cos.. similarly for sum_sin
__shared__ double local_sin[THREAD_SIZE][THREAD_SIZE];
unsigned int xIndex = threadIdx.x + blockIdx.x* blockDim.x;
unsigned int yIndex = threadIdx.y + blockIdx.y* blockDim.y;
int u;
int x=0,y=0;
int tx = threadIdx.x;
int ty = threadIdx.y;
double sum_sines=0.0,sum_cosines=0.0;
double angle=(2*PI)/GPU_ARRAY_SIZE;
block[tx][ty] = d_isub_matrix[yIndex*GPU_ARRAY_SIZE+xIndex];
__syncthreads();
//for every column!
for(u=0; u<THREAD_SIZE; u++)
{
/* All threads calculate its own sin and cos value. */
local_sin[tx][ty] = block[tx][ty] * sin((angle*ty)*u);
local_cos[tx][ty] = block[tx][ty] * cos((angle*ty)*u);
/* Only one row is activate. The thread in row adds all element of its column. */
if (ty == u)
{
sum_sines = 0.0;
sum_cosines = 0.0;
/* Access each column to add all elements of the column.*/
for (y=0; y<THREAD_SIZE; y++)
{
sum_sines += local_sin[tx][y];
sum_cosines += local_cos[tx][y];
}
//if (sum_sines < 0)
//sum_sin[u][tx] = ((-1)*sum_sines)/GPU_ARRAY_SIZE;
//else
sum_sin[u][tx] = sum_sines/GPU_ARRAY_SIZE;
//if (sum_cosines < 0)
//sum_cos[u][tx] = ((-1)*sum_cosines)/GPU_ARRAY_SIZE;
//else
sum_cos[u][tx] = sum_cosines/GPU_ARRAY_SIZE;
}
__syncthreads();
}
spectrum[tx][ty] = sqrt((double)pow(sum_sin[tx][ty],2)
+(double)pow(sum_cos[tx][ty],2));
__syncthreads();
block[tx][ty] = spectrum[tx][ty];
__syncthreads();
//for every row!
for(u=0; u<THREAD_SIZE; u++)
{
/* All threads calculate its own sin and cos value. */
local_sin[tx][ty] = block[tx][ty] * sin((angle*ty)*u);
local_cos[tx][ty] = block[tx][ty] * cos((angle*ty)*u);
/* Only one column is activate. The thread in colum adds all element of its row. */
if (tx == u)
{
sum_sines = 0.0;
sum_cosines = 0.0;
for (x=0; x<THREAD_SIZE; x++)
{
sum_sines += local_sin[x][ty];
sum_cosines += local_cos[x][ty];
}
//if (sum_sines < 0)
//sum_sin[ty][u] = ((-1)*sum_sines)/GPU_ARRAY_SIZE;
//else
sum_sin[ty][u] = sum_sines/GPU_ARRAY_SIZE;
//if (sum_cosines < 0)
//sum_cos[ty][u] = ((-1)*sum_cosines)/GPU_ARRAY_SIZE;
//else
sum_cos[ty][u] = sum_cosines/GPU_ARRAY_SIZE;
}
__syncthreads();
}
spectrum[tx][ty] = sqrt((double)pow(sum_sin[tx][ty],2)+(double)pow(sum_cos[tx][ty],2));
__syncthreads();
/* Transpose! I think this is not necessary part. */
d_osub_matrix[xIndex*GPU_ARRAY_SIZE + yIndex] = spectrum[threadIdx.y][threadIdx.x];
__syncthreads();
}
The main function:
int main(int argc, char** argv)
{
int i,j, w, h, sw, sh;
int numSubblock = CPU_ARRAY_SIZE / GPU_ARRAY_SIZE;
double *d_isub_matrix,*d_osub_matrix;
double *big_matrix = new double[CPU_ARRAY_SIZE*CPU_ARRAY_SIZE];
double *big_matrix2 = new double[CPU_ARRAY_SIZE*CPU_ARRAY_SIZE];
double *isub_matrix = new double[GPU_ARRAY_SIZE*GPU_ARRAY_SIZE];
double *osub_matrix = new double[GPU_ARRAY_SIZE*GPU_ARRAY_SIZE];
cudaEvent_t start,stop;
float elapsedtime;
cudaEventCreate(&start);
cudaEventCreate(&stop);
for (i=0; i<CPU_ARRAY_SIZE; i++)
{
for (j=0; j<CPU_ARRAY_SIZE; j++)
big_matrix[i*CPU_ARRAY_SIZE + j] = rand();//i*CPU_ARRAY_SIZE + j;
}
cudaEventRecord(start,0);
//cudaMalloc((void**)&d_isub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)*2);
//cudaMalloc((void**)&d_osub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)*2);
for(i = 0; i < numSubblock; i++)
{
for (j=0; j < numSubblock; j++)
{
// start position of subarea of big array
cudaMalloc((void**)&d_isub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float));
cudaMalloc((void**)&d_osub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float));
h = i*GPU_ARRAY_SIZE;
w = j*GPU_ARRAY_SIZE;
//printf("h = %d, w=%d",h,w);
//system("PAUSE");
// move subarea of big array into isub array.
for (sh = 0; sh < GPU_ARRAY_SIZE; sh++)
{
for (sw = 0; sw <GPU_ARRAY_SIZE; sw++)
{
isub_matrix[sh*GPU_ARRAY_SIZE+sw] = big_matrix[(h+sh)*CPU_ARRAY_SIZE + (w+sw)];
}
}
cudaMemcpy(d_isub_matrix,isub_matrix,((GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)),cudaMemcpyHostToDevice);
//call the cuda kernel
dim3 blocks(BLOCK_SIZE, BLOCK_SIZE);
dim3 threads(THREAD_SIZE, THREAD_SIZE);
twiddle_factor<<<blocks, threads>>>(d_isub_matrix,d_osub_matrix);
cudaMemcpy(osub_matrix,d_osub_matrix,((GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);
for (sh = 0; sh < GPU_ARRAY_SIZE; sh++)
{
for (sw = 0; sw <GPU_ARRAY_SIZE; sw++)
{
big_matrix2[(h+sh)*CPU_ARRAY_SIZE + (w+sw)] = osub_matrix[sh*GPU_ARRAY_SIZE+sw];
printf(" sh %d sw %d %lf \n", sh, sw, osub_matrix[sh*GPU_ARRAY_SIZE+sw]);
}
}
printf("passei por aqui algumas vezes\n");
cudaFree(d_osub_matrix);
cudaFree(d_isub_matrix);
}
}
// cudaFree(d_osub_matrix);
// cudaFree(d_isub_matrix);
//Stop the time
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedtime,start,stop);
//showing the processing time
printf("The processing time took... %fms to execute everything",elapsedtime);
system("PAUSE");
for (sh = 0; sh < CPU_ARRAY_SIZE; sh++)
{
for (sw = 0; sw <CPU_ARRAY_SIZE; sw++)
{
printf(" sh %d sw %d %lf \n", sh, sw, big_matrix2[sh*CPU_ARRAY_SIZE+sw]);
}
}
system("PAUSE");
// I guess the result is "[1][0] = [1], [1][512] = [513], [513][0] = [524289], [513][512] = [524801]".
}
By a short look the problem could and should be the folling lines:
// start position of subarea of big array
cudaMalloc((void**)&d_isub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float));
cudaMalloc((void**)&d_osub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float));
You are allocating just to few memory for your double values on the GPU. Your sub matrix is allocated with 4 byte per point where 8 byte are needed.

Errors in Polynomial fitting problem on CUDA

I tried to use CUDA to do some simple loops on device, but it seem that it is hard to understand Cuda. I am getting 0 from every function call, when I use CUDA kernel function with normal C code.
The original code:
double evaluate(int D, double tmp[], long *nfeval)
{
/* polynomial fitting problem */
int i, j;
int const M=60;
double px, x=-1, dx=(double)M, result=0;
(*nfeval)++;
dx = 2/dx;
for (i=0;i<=M;i++)
{
px = tmp[0];
for (j=1;j<D;j++)
{
px = x*px + tmp[j];
}
if (px<-1 || px>1) result+=(1-px)*(1-px);
x+=dx;
}
px = tmp[0];
for (j=1;j<D;j++) px=1.2*px+tmp[j];
px = px-72.661;
if (px<0) result+=px*px;
px = tmp[0];
for (j=1;j<D;j++) px=-1.2*px+tmp[j];
px =px-72.661;
if (px<0) result+=px*px;
return result;
}
I wanted to do first for loop on CUDA:
double evaluate_gpu(int D, double tmp[], long *nfeval)
{
/* polynomial fitting problem */
int j;
int const M=60;
double px, dx=(double)M, result=0;
(*nfeval)++;
dx = 2/dx;
int N = M;
double *device_tmp = NULL;
size_t size_tmp = sizeof tmp;
cudaMalloc((double **) &device_tmp, size_tmp);
cudaMemcpy(device_tmp, tmp, size_tmp, cudaMemcpyHostToDevice);
int block_size = 4;
int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
cEvaluate <<< n_blocks, block_size >>> (device_tmp, result, D);
// cudaMemcpy(result, result, size_result, cudaMemcpyDeviceToHost);
px = tmp[0];
for (j=1;j<D;j++) px=1.2*px+tmp[j];
px = px-72.661;
if (px<0) result+=px*px;
px = tmp[0];
for (j=1;j<D;j++) px=-1.2*px+tmp[j];
px =px-72.661;
if (px<0) result+=px*px;
return result;
}
Where the device function looks like:
__global__ void cEvaluate_temp(double* tmp,double result, int D)
{
int M =60;
double px;
double x=-1;
double dx=(double)M ;
int j;
dx = 2/dx;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < 60) //<==>if (idx < M)
{
px = tmp[0];
for (j=1;j<D;j++)
{
px = x*px + tmp[j];
}
if (px<-1 || px>1)
{ __syncthreads();
result+=(1-px)*(1-px); //+=
}
x+=dx;
}
}
I know that I have not specified the problem, but it seem that I have much more than one.
I do not know when to copy variable to device, and when it will be copied 'automatically'.
Now, I am using CUDA 3.2 and there is problem with emulation (I would like to use printf),
when I run NVCC with make emu=1 , there is no error when I use printf, but I also do not get any output.
There is the simplest version of device function, I tested. Can anybody explain what will happen with result value after incrementing it in parallel ? I think I should use device shared memory and synchronization to do sth like "+=" .
__global__ void cEvaluate(double* tmp,double result, int D)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < 60) //<==>if (idx < M)
{
result+=1;
printf("res = %f ",result); //-deviceemu, make emu=1
}
}
No, the variable result is not shared across multiple threads.
What I would suggest is to have a matrix of result values in shared memory, one result for each thread, compute every value and the reduce it to a single value.
__global__ void cEvaluate_temp(double* tmp,double *global_result, int D)
{
int M =60;
double px;
double x=-1;
double dx=(double)M ;
int j;
dx = 2/dx;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ shared_result [blocksize];
if (idx >= 60) return;
px = tmp[0];
for (j=1;j<D;j++)
{
px = x*px + tmp[j];
}
if (px<-1 || px>1)
{
result[threadIdx] +=(1-px)*(1-px);
}
x+=dx;
}
__syncthreads();
if( threadIdx.x == 0) {
total_result = 0.
for (idx in blocksize){
total_result += result[idx];
}
global_result[0] = total_result;
}
Also you need the cudaMemcpy after the kernel invocation. Kernel are asynchronous and needs a sync function.
Also use the error check functions at each CUDA API invocation.