I am starting to implement a custom video decoder that uses the CUDA hardware decoder to generate YUV frames, which I will then encode.
How can I fill the "CUVIDPICPARAMS" struct?
Is it possible?
My algorithm is as follows.
To get the video stream packets I use the ffmpeg-dev libraries (avcodec, avformat, ...).
My steps:
1) Open input file:
avformat_open_input(&ff_formatContext,in_filename,nullptr,nullptr);
2) Get the video stream properties:
avformat_find_stream_info(ff_formatContext,nullptr);
3) Get video stream:
ff_video_stream=ff_formatContext->streams[i];
4) Get CUDA device and init it:
cuDeviceGet(&cu_device,0);
CUcontext cu_vid_ctx;
5) Init video CUDA decoder and set create params:
CUVIDDECODECREATEINFO *cu_decoder_info=new CUVIDDECODECREATEINFO;
memset(cu_decoder_info,0,sizeof(CUVIDDECODECREATEINFO));
...
cuvidCreateDecoder(cu_video_decoder,cu_decoder_info);
6)Read frame data to AVpacket
av_read_frame(ff_formatContext,ff_packet);
Now I need to decode the frame packet on the CUDA video decoder. In theory, that is:
cuvidDecodePicture(pDecoder,&picParams);
But before that I need to fill CUVIDPICPARAMS:
CUVIDPICPARAMS picParams;//=new CUVIDPICPARAMS;
memset(&picParams, 0, sizeof(CUVIDPICPARAMS));
How can I fill the "CUVIDPICPARAMS" struct?
// Per-picture decode parameters. A pointer to one of these is what
// cuvidDecodePicture() receives as its second argument.
// The struct has three parts: generic picture geometry/field flags, the
// bitstream buffer for the picture, and a codec-specific union.
typedef struct _CUVIDPICPARAMS
{
int PicWidthInMbs; // Coded frame width (in macroblocks, going by the field name)
int FrameHeightInMbs; // Coded frame height (in macroblocks, going by the field name)
int CurrPicIdx; // Output index of the current picture
int field_pic_flag; // 0=frame picture, 1=field picture
int bottom_field_flag; // 0=top field, 1=bottom field (ignored if field_pic_flag=0)
int second_field; // Second field of a complementary field pair
// Bitstream data
unsigned int nBitstreamDataLen; // Number of bytes in bitstream data buffer
const unsigned char *pBitstreamData; // Ptr to bitstream data for this picture (slice-layer)
unsigned int nNumSlices; // Number of slices in this picture
const unsigned int *pSliceDataOffsets; // nNumSlices entries, contains offset of each slice within the bitstream data buffer
int ref_pic_flag; // This picture is a reference picture
int intra_pic_flag; // This picture is entirely intra coded
unsigned int Reserved[30]; // Reserved for future use
// Codec-specific data
// All members share storage; presumably only the member matching the stream's
// codec is meaningful for a given picture -- TODO confirm against the API docs.
union {
CUVIDMPEG2PICPARAMS mpeg2; // Also used for MPEG-1
CUVIDH264PICPARAMS h264;
CUVIDVC1PICPARAMS vc1;
CUVIDMPEG4PICPARAMS mpeg4;
CUVIDJPEGPICPARAMS jpeg;
unsigned int CodecReserved[1024]; // Gives the union a size of at least 1024 unsigned ints
} CodecSpecific;
} CUVIDPICPARAMS;
// H.264-specific picture parameters. This is the type of the `h264` member of
// the CodecSpecific union inside CUVIDPICPARAMS.
// Fields are grouped by where they come from in the stream (SPS, PPS), followed
// by the reference-frame list (DPB), scaling matrices, FMO/ASO data and the
// SVC/MVC extension data.
typedef struct _CUVIDH264PICPARAMS
{
// SPS
int log2_max_frame_num_minus4;
int pic_order_cnt_type;
int log2_max_pic_order_cnt_lsb_minus4;
int delta_pic_order_always_zero_flag;
int frame_mbs_only_flag;
int direct_8x8_inference_flag;
int num_ref_frames; // NOTE: shall meet level 4.1 restrictions
unsigned char residual_colour_transform_flag;
unsigned char bit_depth_luma_minus8; // Must be 0 (only 8-bit supported)
unsigned char bit_depth_chroma_minus8; // Must be 0 (only 8-bit supported)
unsigned char qpprime_y_zero_transform_bypass_flag;
// PPS
int entropy_coding_mode_flag;
int pic_order_present_flag;
int num_ref_idx_l0_active_minus1;
int num_ref_idx_l1_active_minus1;
int weighted_pred_flag;
int weighted_bipred_idc;
int pic_init_qp_minus26;
int deblocking_filter_control_present_flag;
int redundant_pic_cnt_present_flag;
int transform_8x8_mode_flag;
int MbaffFrameFlag;
int constrained_intra_pred_flag;
int chroma_qp_index_offset;
int second_chroma_qp_index_offset;
int ref_pic_flag;
int frame_num;
int CurrFieldOrderCnt[2]; // Two entries; presumably one per field (top/bottom) -- TODO confirm
// DPB
CUVIDH264DPBENTRY dpb[16]; // List of reference frames within the DPB
// Quantization Matrices (raster-order)
unsigned char WeightScale4x4[6][16];
unsigned char WeightScale8x8[2][64];
// FMO/ASO
unsigned char fmo_aso_enable;
unsigned char num_slice_groups_minus1;
unsigned char slice_group_map_type;
signed char pic_init_qs_minus26;
unsigned int slice_group_change_rate_minus1;
// The map is reachable either as a 64-bit address or as a host pointer;
// both members alias the same storage.
union
{
unsigned long long slice_group_map_addr;
const unsigned char *pMb2SliceGroupMap;
} fmo;
unsigned int Reserved[12];
// SVC/MVC
// Anonymous union: mvcext and svcext share storage and are accessed directly
// as members of this struct.
union
{
CUVIDH264MVCEXT mvcext;
CUVIDH264SVCEXT svcext;
};
} CUVIDH264PICPARAMS;
This is the purpose of the CUvideoparser object. You feed it the data stream frame by frame through cuvidParseVideoData, and it calls you back with CUVIDPICPARAMS ready to pass to the decoder when it detects it has a complete frame ready.
All this and more is very well illustrated in the D3D9 decode sample, available here. I suggest studying it in detail because there's not much documentation for this API outside of it.
Related
The CUDA graph API exposes a function call for adding a "batch memory operations" node to a graph:
CUresult cuGraphAddBatchMemOpNode (
CUgraphNode* phGraphNode,
CUgraph hGraph,
const CUgraphNode* dependencies,
size_t numDependencies,
const CUDA_BATCH_MEM_OP_NODE_PARAMS* nodeParams
);
but the documentation for this API call does not explain what the flags field of ... is used for, and what one should set the flags to. So what value should I be passing?
A related API function is cuStreamBatchMemOp
CUresult cuStreamBatchMemOp (
CUstream stream,
unsigned int count,
CUstreamBatchMemOpParams* paramArray,
unsigned int flags
);
it essentially takes the fields of CUDA_BATCH_MEM_OP_NODE_PARAMS as its separate parameters. Its documentation says that flags is "reserved for future expansion; must be 0".
Basically, what I want is a function that works like hiloint2uint64(): it just joins two 32-bit integers and reinterprets the result as a uint64.
I cannot find any function in CUDA that can do this. Is there any PTX code that can do that kind of type casting?
You can define your own function like this:
// Packs two 32-bit values into one 64-bit value.
//   h: supplies bits 63..32 of the result (the "hi" word)
//   l: supplies bits 31..0 of the result (the "lo" word)
// Returns the combined value. Uses only shifts and an OR, so the result does
// not depend on the byte order of the machine or on the alignment of any
// temporary.
__host__ __device__ unsigned long long int hiloint2uint64(int h, int l)
{
    // Go through unsigned int first: converting a negative int straight to a
    // 64-bit type would sign-extend it and smear ones over the other half.
    const unsigned long long int high = static_cast<unsigned int>(h);
    const unsigned long long int low = static_cast<unsigned int>(l);
    return (high << 32) | low;
}
Maybe a bit late by now, but probably the safest way to do this is to do it "manually" with bit-shifts and or:
// Convert to unsigned 32-bit first so that a negative h or l is not
// sign-extended when it is widened to 64 bits.
uint32_t ui_h = h;
uint32_t ui_l = l;
// Shift/OR the unsigned copies (not the original signed h and l).
return (uint64_t(ui_h)<<32)|(uint64_t(ui_l));
Note the other solution presented in the other answer isn't safe, because the array of ints might not be 8-byte aligned (and shifting some bits is faster than memory read/write, anyway)
Use uint2 (but define the temporary variable as 64-bit value: unsigned long long int) instead of arrays to be sure of alignment.
Be careful about the order of l and h.
// Packs h (written to the .y half) and l (written to the .x half) into one
// 64-bit value by viewing the 64-bit result as a uint2.
// The temporary is declared as the 64-bit type, so it has 64-bit alignment.
__host__ __device__ __forceinline__ unsigned long long int hiloint2uint64(unsigned int h, unsigned int l)
{
    unsigned long long int packed;
    uint2* const halves = reinterpret_cast<uint2*>(&packed);
    halves->x = l;
    halves->y = h;
    return packed;
}
The CUDA registers have a size of 32 bits anyway. In the best case the compiler won't need any extra code. In the worst case it has to reorder the registers by moving a 32-bit value.
Godbolt example https://godbolt.org/z/3r9WYK9e7 of how optimized it gets.
Let us assume that we have the following strings that we need to store in a CUDA array.
"hi there"
"this is"
"who is"
How do we declare an array on the GPU to do this? I tried using C++ strings, but that does not work.
Probably the best way to do this is to use a structure that is similar to common compressed sparse matrix formats. Store the character data packed into a single piece of linear memory, then use a separate integer array to store the starting indices, and perhaps a third array to store the string lengths. The storage overhead of the latter might be more efficient than storing a string termination byte for every entry in the data and trying to parse for the terminator inside the GPU code.
So you might have something like this:
// Array of strings packed for GPU use: all character data sits in one linear
// buffer, with parallel per-string index arrays describing where each string
// starts and how long it is.
struct gpuStringArray {
    unsigned int * pos;    // pos[i]: start of string i, as an element offset into data (tkernel computes data + pos[i])
    unsigned int * length; // length[i]: length of string i, counted in char4 words as used by tkernel; could be a smaller type if strings are short
    char4 * data; // 32 bit data type will improve memory throughput, could be 8 bit
};
Note I used a char4 type for the string data; the vector type will give better memory throughput, but it will mean strings need to be aligned/suitably padded to 4 byte boundaries. That may or may not be a problem depending on what a typical real string looks like in your application. Also, the type of the (optional) length parameter should probably be chosen to reflect the maximum admissible string length. If you have a lot of very short strings, it might be worth using an 8 or 16 bit unsigned type for the lengths to save memory.
A really simplistic code to compare strings stored this way in the style of strcmp might look something like this:
// Compares two char4 words component by component in x, y, z, w order.
// Returns the difference (c1 minus c2) of the first components that differ,
// or 0 when all four components are equal.
__device__ __host__
int cmp4(const char4 & c1, const char4 & c2)
{
    const int dx = c1.x - c2.x;
    if (dx != 0) {
        return dx;
    }

    const int dy = c1.y - c2.y;
    if (dy != 0) {
        return dy;
    }

    const int dz = c1.z - c2.z;
    if (dz != 0) {
        return dz;
    }

    // Last component: its difference is the answer whether or not it is zero.
    return c1.w - c2.w;
}
// Compares the first nwords char4 words of s1 and s2.
// Returns the cmp4() result of the first word pair that differs, or 0 when
// all nwords pairs are equal (including when nwords is 0).
__device__ __host__
int strncmp4(const char4 * s1, const char4 * s2, const unsigned int nwords)
{
    unsigned int word = 0;
    while (word < nwords) {
        const int diff = cmp4(s1[word], s2[word]);
        if (diff != 0) {
            return diff;
        }
        ++word;
    }
    return 0;
}
// One thread per string pair: thread t compares string t of `a` with string t
// of `b` and stores the strncmp4() result in result[t].
// NOTE(review): there is no element-count parameter, so the global index is
// never bounds-checked; presumably the launch must create exactly one thread
// per string -- confirm against the caller.
// NOTE(review): only the first min(length) words are compared, so when one
// string is a word-wise prefix of the other the stored result is 0.
__global__
void tkernel(const struct gpuStringArray a, const gpuStringArray b, int * result)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    const unsigned int lenA = a.length[tid];
    const unsigned int lenB = b.length[tid];
    const unsigned int common = min(lenA, lenB);

    const char4 * const lhs = a.data + a.pos[tid];
    const char4 * const rhs = b.data + b.pos[tid];

    result[tid] = strncmp4(lhs, rhs, common);
}
[disclaimer: never compiled, never tested, no warranty real or implied, use at your own risk]
There are some corner cases and assumptions in this which might catch you out depending on exactly what the real strings in your code look like, but I will leave those as an exercise to the reader to resolve. You should be able to adapt and expand this into whatever it is you are trying to do.
You have to use C-style character strings char *str. Searching for "CUDA string" on google would have given you this CUDA "Hello World" example as first hit: http://computer-graphics.se/hello-world-for-cuda.html
There you can see how to use char*-strings in CUDA. Be aware that standard C-functions like strcpy or strcmp are not available in CUDA!
If you want an array of strings, you just have to use char** (as in C/C++). As for strcmp and similar functions, it highly depends on what you want to do. CUDA is not really well suited for string operations, maybe it would help if you would provide a little more detail about what you want to do.
I have a C function that returns an unsigned char * that can either be a pointer to a byte array (binary data representing a File..etc) or a pointer to an array of characters. I'm currently using the SWIG %array_class that wraps all C functions that return an unsigned char pointer and creates a Java array utility (SampleArrayUtil.java) to handle the population and retrieval on the Java side.
My problem is that I also wrap the unsigned char * using: %apply char * { unsigned char * }; so that I get an array of Strings on the Java side. I don't want to wrap the unsigned char * return value (using %apply char * { unsigned char * };) when I get binary data back; I just want to have the byte array on the Java side. I was thinking of creating another C function to handle the binary data, but I'm unsure how to wrap this new function as it will also return an unsigned char * (see getValueFromRowAsByteArray)
C Functions:
unsigned char * getValueFromRowAsStringArray(struct result_row *row, attribute_type type, int32_t *len)
unsigned char * getValueFromRowAsByteArray(struct result_row *row, attribute_type type, int32_t *len)
//*row* input param with data results, *type* input enum type for the data type being requested and *len* is an output param that contains the length of the data being returned.
SWIG Interface File for Wrapping C Function Returning unsigned char * (array of char):
// SWIG interface for the C API; the generated module is named "Sample".
%module Sample
// Stock SWIG library files used by the directives below.
%include "typemaps.i"
%include "stdint.i"
%include "arrays_java.i"
%include "carrays.i"
// Generates the SampleArrayUtil helper class for working with unsigned char arrays.
%array_class(unsigned char, SampleArrayUtil);
// Code between %{ and %} is for the generated wrapper source, which needs the C declarations.
%{
#include "C_API.h"
%}
// From this point on every unsigned char * uses the char * typemaps.
// Because this precedes the %include of C_API.h, it affects all declarations in that header.
%apply char * { unsigned char * };
%include "C_API.h"
You can apply different type maps to the same types in different places in at least two ways.
Firstly you can change the active typemap with %apply or %clear, e.g.:
%module test
%include "stdint.i"
%apply intptr_t { unsigned char * };
unsigned char * test1();
%apply char * { unsigned char * };
unsigned char * test2();
%clear unsigned char *;
unsigned char * test3();
Gives three functions in Java with different return types, according to the active typemap.
Secondly you can also write more specific typemaps though, for example:
%apply long long { unsigned char * test4 };
%apply char * { unsigned char * test5 };
unsigned char * test4();
unsigned char * test5();
Only applies to test4 and test5 respectively - it matches on the type and the function name. In Java this results in:
public static long test4() {
return testJNI.test4();
}
public static String test5() {
return testJNI.test5();
}
For arguments you can match on the type and the parameter name in the function signature similarly.
int main(int argc, char *argv[])
{
QCoreApplication a(argc, argv);
// string s = "{\"age\":23,\"study\":{\"language\":{\"one\":\"chinese\",\"subject\":[{\"one\":\"china\"},{\"two\":\"Eglish\"}]}}}";
string s = "{\"age\" : 26,\"person\":[{\"id\":1,\"study\":[{\"language\":\"chinese\"},{\"language1\":\"chinese1\"}],\"name\":\"chen\"},{\"id\":2,\"name\":\"zhang\"}],\"name\" : \"huchao\"}";
ptree pt;
stringstream stream(s);
read_json<ptree>( stream, pt);
int s1=pt.get<int>("age");
cout<<s1<<endl;
string s2 = pt.get<string>("person."".study."".language1");
cout<<s2<<endl;
Now I want to get the value of language1.
First of all, I've got to ask why you have a list with such different elements in it? If language1 has some special meaning, then I would split the data up into study and study1 or something like that. In general, lists should be of a single type.
Assuming you can't change the format, here is the answer to your question. To the best of my knowledge, the only way to get something out of an array is to iterate over it.
#include <boost/foreach.hpp>
BOOST_FOREACH(const ptree::value_type& val, pt.get_child("person.study"))
{
boost::optional<string> language1Option = v.second.get_optional<string>("language1");
if(language1Option) {
cout<<"found language1: "<<*language1Option<<endl;
}
}
This code iterates over everything in the "study" list and looks for an entry with a "language1" key, printing the result