Converting between SDP's sprop-parameter-sets and mkv's CodecPrivate - h.264

Is there some easy way to convert between h264 settings as stored in Matroska file:
+ CodecPrivate, length 36 (h.264 profile: Baseline #L2.0) hexdump
01 42 c0 14 ff e1 00 15 67 42 c0 14 da 05 07 e8
40 00 00 03 00 40 00 00 0c 03 c5 0a a8 01 00 04
68 ce 0f c8
and the same settings when streaming that matroska file using RTSP?:
a=fmtp:96 packetization-mode=1;profile-level-id=42C014;sprop-parameter-sets=Z0LAFNoFB+hAAAADAEAAAAwDxQqo,aM4PyA==
Base-64 strings decodes to this:
00000000 67 42 c0 14 da 05 07 e8 40 00 00 03 00 40 00 00 |gB......@....@..|
00000010 0c 03 c5 0a a8
00000000 68 ce 0f c8 |h...|
which partially matches the data in mkv's CodecPrivate.

Extracted conversion from raw to CodecPrivate from ffmpeg:
/*
* AVC helper functions for muxers
* Copyright (c) 2006 Baptiste Coudurier <baptiste.coudurier@smartjog.com>
* Modified by _Vi: stand-alone version (without ffmpeg)
*
* This file is based on the code from FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <string.h>
#include <stdio.h>
#define assert(x) if(!(x)) { fprintf(stderr, "Assertion failed...\n"); return -1; }
#ifndef AV_RB24
# define AV_RB24(x) \
((((const uint8_t*)(x))[0] << 16) | \
(((const uint8_t*)(x))[1] << 8) | \
((const uint8_t*)(x))[2])
#endif
#ifndef AV_RB32
# define AV_RB32(x) \
(((uint32_t)((const uint8_t*)(x))[0] << 24) | \
(((const uint8_t*)(x))[1] << 16) | \
(((const uint8_t*)(x))[2] << 8) | \
((const uint8_t*)(x))[3])
#endif
/*
 * Minimal stand-ins for FFmpeg's avio writers. `pb` is an `unsigned char **`
 * cursor: every macro stores at **pb and advances *pb past the bytes written.
 * Multi-byte values are written big-endian.
 *
 * Each macro is a single do { ... } while (0) statement, so it is safe as the
 * body of an unbraced if/else; the caller supplies the trailing semicolon.
 * Arguments may be evaluated more than once, so they must be free of side
 * effects.
 */
#define avio_w8(pb, x) do { *(*(pb))++ = (unsigned char)(x); } while (0)
#define avio_wb16(pb, x) do { \
    *(*(pb))++ = (unsigned char)(((x) >> 8) & 0xFF); \
    *(*(pb))++ = (unsigned char)((x) & 0xFF); \
} while (0)
#define avio_wb32(pb, x) do { \
    *(*(pb))++ = (unsigned char)(((x) >> 24) & 0xFF); \
    *(*(pb))++ = (unsigned char)(((x) >> 16) & 0xFF); \
    *(*(pb))++ = (unsigned char)(((x) >> 8) & 0xFF); \
    *(*(pb))++ = (unsigned char)((x) & 0xFF); \
} while (0)
#define avio_write(pb, b, l) do { memcpy(*(pb), (b), (l)); *(pb) += (l); } while (0)
/*
 * Fixed-width integer types come from the C library. The former hand-rolled
 * typedefs declared uint32_t as unsigned long (64 bits on LP64 systems) and
 * intptr_t as int (narrower than a pointer on 64-bit targets).
 */
#include <stdint.h>
/*
 * Find the first 3-byte start code (00 00 01) in [p, end) that is followed
 * by at least one more byte.
 *
 * Returns a pointer to the first 00 of that start code, or `end` when there
 * is none.
 *
 * The fast path still examines four bytes per step using the "does this word
 * contain a zero byte" test, but the word is assembled from individual bytes
 * instead of dereferencing a casted uint32_t pointer. This avoids the
 * unaligned, type-punned load and the pointer-to-integer cast that was only
 * needed for alignment. Buffers shorter than four bytes are rejected up
 * front, so no pointer before the buffer is ever formed.
 */
static const uint8_t *ff_avc_find_startcode_internal(const uint8_t *p, const uint8_t *end)
{
    const uint8_t *last;

    /* A hit needs the three start-code bytes plus one byte after them. */
    if (end - p < 4)
        return end;
    last = end - 3; /* candidate positions are p .. last - 1 */

    /* Four candidates per step; reads p[0..5], hence more than 6 bytes left. */
    while (end - p > 6) {
        uint32_t x = (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
                     ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);

        /* Non-zero only if at least one of the four bytes is zero. */
        if ((x - 0x01010101) & (~x) & 0x80808080) {
            if (p[1] == 0) {
                if (p[0] == 0 && p[2] == 1)
                    return p;
                if (p[2] == 0 && p[3] == 1)
                    return p + 1;
            }
            if (p[3] == 0) {
                if (p[2] == 0 && p[4] == 1)
                    return p + 2;
                if (p[4] == 0 && p[5] == 1)
                    return p + 3;
            }
        }
        p += 4;
    }

    /* Remaining candidates, one byte at a time. */
    for (; p < last; p++) {
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
            return p;
    }
    return end;
}
/*
 * Locate the next start code in [p, end). When the 3-byte code found by
 * ff_avc_find_startcode_internal() is directly preceded by a zero byte that
 * lies after p, the result is moved back by one so it covers the 4-byte form
 * (00 00 00 01). Returns whatever the internal search returned when there is
 * nothing to extend.
 */
const uint8_t *ff_avc_find_startcode(const uint8_t *p, const uint8_t *end)
{
    const uint8_t *hit = ff_avc_find_startcode_internal(p, end);

    if (hit > p && hit < end && hit[-1] == 0)
        return hit - 1;
    return hit;
}
/*
 * Re-frame a start-code delimited byte stream as length-prefixed NAL units.
 *
 * For every unit found between start codes in buf_in[0..size), a 4-byte
 * big-endian length followed by the unit's bytes is written at *pb, and *pb
 * is advanced. Bytes before the first start code are dropped.
 *
 * Returns the total number of bytes written. The caller must make sure the
 * destination is large enough; no bound is checked here.
 */
int ff_avc_parse_nal_units(unsigned char **pb, const uint8_t *buf_in, int size)
{
    const uint8_t *const stop = buf_in + size;
    const uint8_t *cur = ff_avc_find_startcode(buf_in, stop);
    int written = 0;

    for (; cur < stop; ) {
        const uint8_t *next;
        int payload;

        /* Step over the start code: its zero bytes and the terminating 01. */
        while (*cur++ == 0)
            ;

        next = ff_avc_find_startcode(cur, stop);
        payload = (int)(next - cur);

        avio_wb32(pb, payload);
        avio_write(pb, cur, payload);
        written += 4 + payload;

        cur = next;
    }
    return written;
}
/*
 * Convert buf_in (start-code delimited, *size bytes) into length-prefixed
 * NAL units stored at *buf. *buf itself is left pointing at the start of the
 * output; *size is replaced by the number of bytes produced.
 * Always returns 0.
 */
int ff_avc_parse_nal_units_buf(const unsigned char *buf_in, unsigned char **buf, int *size)
{
    unsigned char *cursor = *buf;

    ff_avc_parse_nal_units(&cursor, buf_in, *size);
    *size = cursor - *buf;

    return 0;
}
/*
 * Write an avcC-style H.264 configuration record at *pb and advance *pb past
 * the bytes written.
 *
 * data/len: either a start-code delimited stream (00 00 01 or 00 00 00 01
 *           at the very beginning) containing at least one SPS and one PPS
 *           NAL unit, or any other data, which is copied through unchanged.
 *
 * Returns 0 on success (inputs of 6 bytes or fewer write nothing) and a
 * negative value if the input cannot be converted: too large for the
 * internal scratch buffer, no SPS or PPS present, or an SPS too short to
 * carry profile/compat/level.
 *
 * The caller must provide room behind *pb: 11 bytes of framing plus the SPS
 * and PPS payloads, or len bytes in the pass-through case. When several SPS
 * or PPS units are present, the last one of each kind is used.
 */
int my_isom_write_avcc(unsigned char **pb, const uint8_t *data, int len)
{
    unsigned char tmpbuf[4000];
    uint8_t *buf = tmpbuf;
    const uint8_t *end;
    const uint8_t *sps = 0, *pps = 0;
    uint32_t sps_size = 0, pps_size = 0;
    int ret;

    if (len <= 6)
        return 0;

    /* check for h264 start code; anything else is passed through as-is */
    if (AV_RB32(data) != 0x00000001 && AV_RB24(data) != 0x000001) {
        avio_write(pb, data, len);
        return 0;
    }

    /*
     * Replacing a 3-byte start code by a 4-byte length grows the data by at
     * most one byte per three input bytes, so this cap keeps the converted
     * stream inside tmpbuf.
     */
    if (len > (int)(sizeof(tmpbuf) / 4 * 3)) {
        fprintf(stderr, "Input too large (%d bytes)\n", len);
        return -1;
    }

    ret = ff_avc_parse_nal_units_buf(data, &buf, &len);
    if (ret < 0)
        return ret;
    end = buf + len;

    /* look for sps and pps; each entry is a 4-byte size followed by payload */
    while (end - buf > 4) {
        uint32_t size = AV_RB32(buf);
        uint8_t nal_type = buf[4] & 0x1f;

        if (nal_type == 7) { /* SPS */
            sps = buf + 4;
            sps_size = size;
        } else if (nal_type == 8) { /* PPS */
            pps = buf + 4;
            pps_size = size;
        }
        buf += size + 4;
    }

    if (!sps || !pps) {
        fprintf(stderr, "Assertion failed...\n");
        return -1;
    }
    /* sps[1..3] are copied into the record header below */
    if (sps_size < 4) {
        fprintf(stderr, "SPS too short (%lu bytes)\n", (unsigned long)sps_size);
        return -1;
    }

    avio_w8(pb, 1);      /* version */
    avio_w8(pb, sps[1]); /* profile */
    avio_w8(pb, sps[2]); /* profile compat */
    avio_w8(pb, sps[3]); /* level */
    avio_w8(pb, 0xff);   /* 6 bits reserved (111111) + 2 bits nal size length - 1 (11) */
    avio_w8(pb, 0xe1);   /* 3 bits reserved (111) + 5 bits number of sps (00001) */

    avio_wb16(pb, sps_size);
    avio_write(pb, sps, sps_size);
    avio_w8(pb, 1);      /* number of pps */
    avio_wb16(pb, pps_size);
    avio_write(pb, pps, pps_size);

    return 0;
}
#define H264PRIVATE_MAIN
#ifdef H264PRIVATE_MAIN
/*
 * Filter: reads up to 1000 bytes from stdin, converts them with
 * my_isom_write_avcc() and writes the resulting record to stdout.
 * Exits with 0 on success and 1 if the conversion or the write fails.
 */
int main() {
    unsigned char data[1000];
    /*
     * The record adds 11 framing bytes around the SPS and PPS payloads
     * (6 header bytes, 2 SPS length, 1 PPS count, 2 PPS length), so leave
     * headroom beyond the input size.
     */
    unsigned char output[sizeof(data) + 16];
    unsigned char *output_f = output;
    size_t produced;
    int len = (int)fread(data, 1, sizeof(data), stdin);

    if (my_isom_write_avcc(&output_f, data, len) < 0)
        return 1;

    produced = (size_t)(output_f - output);
    if (fwrite(output, 1, produced, stdout) != produced)
        return 1;
    return 0;
}
#endif
Inserting "00 00 00 01" before each base-64-decoded block and feeding it into that program outputs CodecPrivate:
$ printf '\x00\x00\x00\x01'\
'\x67\x42\xc0\x14\xda\x05\x07\xe8\x40\x00\x00\x03\x00\x40\x00\x00\x0c\x03\xc5\x0a\xa8'\
'\x00\x00\x00\x01'\
'\x68\xce\x0f\xc8' | ./avc_to_mkvcodecpriv | hd
00000000 01 42 c0 14 ff e1 00 15 67 42 c0 14 da 05 07 e8 |.B......gB......|
00000010 40 00 00 03 00 40 00 00 0c 03 c5 0a a8 01 00 04 |@....@..........|
00000020 68 ce 0f c8 |h...|
00000024

Related

Decoding H264 Stream Always Returns MF_E_TRANSFORM_NEED_MORE_INPUT

I'm attempting to decode raw h264 from a network stream using the Media Foundation Transform CLSID_MSH264DecoderMFT. Setting up the transform seems to work and it's accepting data. However, no matter how much data I provide, it always returns MF_E_TRANSFORM_NEED_MORE_INPUT.
The document says, that the decoder will skip over all data until it finds valid Sequence and Picture Parameters. I'm providing this and then a raw data frame along with start codes:
1 00 00 00 01 67 42 c0 28 da 01 e0 19 fe 7c 05 a8 08 08 0a 00 00 03 00 02 00 00 03 00 61 1e 30 65
2 40 00 00 00 01 68 ce 3c 80 00 00 00 01 00 00 0e 6c 41 9a e0 eb 08 84 3c 14 ff fe 10 ff f8 64 14
3 f0 88 20 11 55 d5 7e 19 11 17 17 c5 c5 3f 05 00 a3 86 41 08 8a ae ab 58 8c 1f 11 88 cd f8 9f ff
4 f8 9d 78 21 f9 2a bf e2 3e 04 1f f8 20 08 92 7c 0e 33 52 67 e1 48 74 32 f8 5c 5f ca fd 77 12 df
5 3a 0f 93 11 89 2f 26 98 76 16 65 9b 78 87 77 ff ff fe 27 c6 fe b1 39 34 27 04 17 55 f0 61 fe 23
Above is only a partial sample, but it's representative of the data I provide to the transform.
Transform Setup:
ComPtr<IUnknown> pUnknown = nullptr;
HRESULT hResult = CoCreateInstance(CLSID_MSH264DecoderMFT, nullptr, CLSCTX_INPROC_SERVER, IID_IUnknown, &pUnknown);
if (S_OK != hResult) {
LogError("Failed to create H264 decoder");
return false;
}
hResult = pUnknown->QueryInterface(IID_PPV_ARGS(&mVideoDecoder));
if (hResult != S_OK) {
LogError("Failed to create H264 decoder");
return false;
}
ComPtr<IMFMediaType> pInputMediaType = nullptr;
hResult = MFCreateMediaType(&pInputMediaType);
if (S_OK != hResult) {
return false;
}
pInputMediaType->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Video);
pInputMediaType->SetGUID(MF_MT_SUBTYPE, MFVideoFormat_H264);
std::shared_ptr<VideoMp4Track> videoTrack = mDemuxer->getVideoTrack();
uint32_t width = videoTrack->getWidth();
uint32_t height = videoTrack->getHeight();
MFSetAttributeSize(pInputMediaType.Get(), MF_MT_FRAME_SIZE, width, height);
MFSetAttributeRatio(pInputMediaType.Get(), MF_MT_PIXEL_ASPECT_RATIO, width, height);
MFSetAttributeRatio(pInputMediaType.Get(), MF_MT_FRAME_RATE, videoTrack->getFrameRate(), 1);
pInputMediaType->SetUINT32(MF_MT_INTERLACE_MODE, MFVideoInterlace_MixedInterlaceOrProgressive);
ComPtr<IMFAttributes> attributes;
mVideoDecoder->GetAttributes(&attributes);
hResult = attributes->SetUINT32(CODECAPI_AVLowLatencyMode, 1);
if (hResult != S_OK) {
LogError("Failed to set low latency mode. Video might be choppy.");
}
hResult = attributes->SetUINT32(CODECAPI_AVDecVideoAcceleration_H264, 1);
if (hResult != S_OK) {
LogError("Failed to set GPU acceleration. Video might be choppy.");
}
hResult = mVideoDecoder->SetInputType(0, pInputMediaType.Get(), 0);
if (hResult != S_OK) {
LogError("Failed to set input type for decoder");
return false;
}
ComPtr<IMFMediaType> pOutputType = nullptr;
hResult = MFCreateMediaType(&pOutputType);
if (S_OK != hResult) {
return false;
}
pOutputType->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Video);
pOutputType->SetGUID(MF_MT_SUBTYPE, MFVideoFormat_NV12);
MFSetAttributeSize(pOutputType.Get(), MF_MT_FRAME_SIZE, width, height);
MFSetAttributeRatio(pOutputType.Get(), MF_MT_PIXEL_ASPECT_RATIO, width, height);
MFSetAttributeRatio(pOutputType.Get(), MF_MT_FRAME_RATE, videoTrack->getFrameRate(), 1);
hResult = mVideoDecoder->SetOutputType(0, pOutputType.Get(), 0);
if (hResult != S_OK) {
LogError("Failed to set input type for decoder");
return false;
}
// Notify the resampler.
hResult = mVideoDecoder->ProcessMessage(MFT_MESSAGE_COMMAND_FLUSH, NULL);
if (S_OK != hResult) {
LogError("Failed to send flush command to the decoder.");
return false;
}
hResult = mVideoDecoder->ProcessMessage(MFT_MESSAGE_NOTIFY_BEGIN_STREAMING, NULL);
if (S_OK != hResult) {
LogError("Failed to send notify command to the decoder.");
return false;
}
hResult = mVideoDecoder->ProcessMessage(MFT_MESSAGE_NOTIFY_START_OF_STREAM, NULL);
if (S_OK != hResult) {
LogError("Failed to send notify command to the decoder.");
return false;
}
I have no idea why it isn't able to decode, would appreciate any help.
Thanks.
Edit:
DataPtr transformData = MakeDataPtr();
uint32_t startCode = 0x01000000;
std::shared_ptr<VideoMp4Track> video = mImpl->mDemuxer->getVideoTrack();
transformData->appendBytes(&startCode, 4);
DataPtr sps = video->getSequenceParameters();
transformData->appendData(*sps);
transformData->appendBytes(&startCode, 4);
DataPtr pps = video->getPictureParameters();
transformData->appendData(*pps);
transformData->appendBytes(&startCode, 4);
transformData->appendData(*sampleData);
transformData->appendBytes(&startCode, 4);
ComPtr<IMFSample> pSample = mImpl->createMFSample(transformData->getBytes(), transformData->getSize());
if (nullptr == pSample) {
LogError("Failed to create the buffer for decoder input");
return nullptr;
}
HRESULT hResult = mImpl->mVideoDecoder->ProcessInput(0, pSample.Get(), 0);
if (hResult != S_OK) {
if (hResult == MF_E_NOTACCEPTING) {
mImpl->mVideoDecoder->ProcessMessage(MFT_MESSAGE_COMMAND_FLUSH, NULL);
hResult = mImpl->mVideoDecoder->ProcessInput(0, pSample.Get(), 0);
}
else {
LogError("Error feeding to resampler...");
return nullptr;
}
}
DWORD dwStatus = 0;
// outputDataBuffer is empty, need to create it.
MFT_OUTPUT_DATA_BUFFER outputDataBuffer{};
ComPtr<IMFSample> pVideoSample = nullptr;
hResult = MFCreateSample(&pVideoSample);
if (S_OK != hResult) {
LogError("Failed to create a media sample for decoder output");
return false;
}
ComPtr<IMFMediaBuffer> pOutputBuffer = nullptr;
hResult = MFCreateMemoryBuffer(sampleData->getSize(), &pOutputBuffer);
if (S_OK != hResult) {
LogError("Failed to create a memory buffer for decoder output");
return false;
}
pVideoSample->AddBuffer(pOutputBuffer.Get());
outputDataBuffer.pSample = pVideoSample.Get();
do {
hResult = mImpl->mVideoDecoder->ProcessOutput(0, 1, &outputDataBuffer, &dwStatus);
if (hResult == MF_E_TRANSFORM_NEED_MORE_INPUT) {
// conversion end
break;
}
I've omitted the rest because it never gets further, it just stays in this loop populating the transform.
Edit 2:
(Not) Working sample on github
https://github.com/pma07pg/h264
The sample code was too large to dump here so I've put the main.cpp on github. Should be able to just put it into a VS project and run it off the bat.
There are a few bugs in your code.
1.) You didn't account for the start code size
yours:
const uint32_t parameterInputSize = sizeof(pictureParameters) + sizeof(sequenceParameters);
mine:
const uint32_t parameterInputSize = sizeof(startCode) + sizeof(pictureParameters) + sizeof(startCode) + sizeof(sequenceParameters);
Your 'mdat's contain more than one AccessUnit. Each AccessUnit is prefixed with its length which you have to replace with a start code.
Your 'mdat':
'mdat' = <size> data[0] | <size> data[1] | ... | <size> data[n] |
Replace the size with a start code and break the multiple Access Units into individual Access Units.
Required decoder input:
00 00 00 01 data[0]
00 00 00 01 data[1]
...
00 00 00 01 data[n]
See details here: https://github.com/go4shoe/MedieFoundationExample

Multiway stable partition

Is there a way to perform multiway (>2) stable partition in Thrust?
Either stable partition or stable partition copy both are equally interesting. Currently I can only use two-way stable partition copy for purposes described above. It is clear how to use it to partition a sequence into a three parts using two predicates and two calls of thrust::stable_partition_copy. But I am sure it is technically possible to implement multiway stable partition.
I can imagine the following multiway stable partition copy (pseudocode):
using F = float;
thrust::device_vector< F > trianges{N * 3};
// fill triangles here
thrust::device_vector< F > A{N}, B{N}, C{N};
auto vertices_begin = thrust::make_tuple(A.begin(), B.begin(), C.begin());
using U = unsigned int;
auto selector = [] __host__ __device__ (U i) -> U { return i % 3; };
thrust::multiway_stable_partition_copy(p, triangles.cbegin(), triangles.cend(), selector, vertices_begin);
A.begin(), B.begin(), C.begin() should be incremented individually.
Also, I can imagine hypothetical dispatch iterator, which would do the same (and would be more useful I think).
From my knowledge of the thrust internals, there is no readily adaptable algorithm to do what you envisage.
A simple approach would be to extend your theoretical two pass three way partition to M-1 passes using a smart binary predicate, something like
template<typename T>
struct divider
{
int pass;
__host__ __device__ divider(int p) : pass(p) { };
__host__ __device__ int classify(const T &val) { .... };
__host__ __device__ bool operator()(const T &val) { return !(classify(val) > pass); };
}
which enumerates a given input into M possible subsets and returns true if the input is in the Nth or less subset, and then a loop
auto start = input.begin();
for(int i=0; i<(M-1); ++i) {
divider pred<T>(i);
result[i] = thrust::stable_partition(
thrust::device,
start,
input.end(),
pred);
start = result[i];
}
[ note all code written in a browser on a tablet while floating on a boat in the Baltic. Obviously never compiled or run. ]
This will certainly be the most space efficient, as a maximum of len(input) temporary storage is required, whereas a hypothetical single pass implementation would require M * len(input) storage, which would quickly get impractical for a large M.
Edit to add that now I'm back on land with a compiler, this seems to work as expected:
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/copy.h>
#include <thrust/partition.h>
// Predicate for one partition pass: a value is kept in the front part when
// its class (val % 12) is not greater than the pass number.
struct divider
{
    int pass;

    __host__ __device__
    divider(int p) : pass(p) {}

    // Class index of a value.
    __host__ __device__
    int classify(const int &val) { return val % 12; }

    // True when the value's class is at or below this pass.
    __host__ __device__
    bool operator()(const int &val) { return classify(val) <= pass; }
};
// Demo: fill a device vector with 0..N-1, split it into M classes with M-1
// successive stable_partition passes, then print the values with a line
// break at each recorded partition boundary.
int main()
{
    const int M = 12;
    const int N = 120;

    thrust::device_vector<int> input(N);
    thrust::counting_iterator<int> first(0);
    thrust::copy(first, first + N, input.begin());

    // result[k] is the end of the k-th class after pass k.
    thrust::device_vector<int>::iterator result[M];

    auto start = input.begin();
    for (int pass = 0; pass < (M - 1); ++pass) {
        divider pred(pass);
        result[pass] = thrust::stable_partition(thrust::device, start, input.end(), pred);
        // The next pass only needs to look at what is left.
        start = result[pass];
    }

    int boundary = 0;
    for (auto it = input.begin(); it != input.end(); ++it) {
        if (it == result[boundary]) {
            boundary++;
            std::cout << std::endl;
        }
        std::cout << *it << " ";
    }

    return 0;
}
$ nvcc -std=c++11 -arch=sm_52 -o partition partition.cu
$ ./partition
0 12 24 36 48 60 72 84 96 108
1 13 25 37 49 61 73 85 97 109
2 14 26 38 50 62 74 86 98 110
3 15 27 39 51 63 75 87 99 111
4 16 28 40 52 64 76 88 100 112
5 17 29 41 53 65 77 89 101 113
6 18 30 42 54 66 78 90 102 114
7 19 31 43 55 67 79 91 103 115
8 20 32 44 56 68 80 92 104 116
9 21 33 45 57 69 81 93 105 117
10 22 34 46 58 70 82 94 106 118
11 23 35 47 59 71 83 95 107 119

adding 1 to a binary number using logical operations

As title describes; I want to add 1 to a 4 bit binary number using only AND OR XOR operations. How can I achieve that?
Regards
Think about what you're doing when you perform addition of decimal numbers in long-hand. It's exactly the same.
Here's how I'd do it, showing a lot of working.
Label the four bits from b0 (least significant bit) to b3 (most significant bit), and introduce 5 carry bits, c0 to c4. The modified values are b3', b2', b1', b0', so your nibble, the carry bits, and the modified values are:
{ b3 b2 b1 b0 }
{ c4 c3 c2 c1 c0 }
{ b3' b2' b1' b0' }
and they are related through:
c0 = 1 (this is to flip the least significant bit)
b0' = XOR(b0, 1)
c1 = AND(b0, 1)
b1' = XOR(b1, c1)
c2 = AND(b1, c1)
b2' = XOR(b2, c2)
c3 = AND(b2, c2)
b3' = XOR(b3, c3)
c4 = AND(b3, c3)
Note:
There's no need for OR to be used.
The choice of four bits is arbitrary - beyond the first bit, the logic is copy/pasta.
When the last carry bit c4 is 1, the number is silently overflowing (going from 15 to 0).
There's no need to have five carry bits, but in keeping with the hand-addition paradigm, I've introduced them anyway.
Four bits is a Nibble.
Sample C# class:
/// <summary>
/// A 4-bit counter stored as individual booleans, least significant bit at
/// index 0, incremented with XOR/AND logic only.
/// </summary>
public class Nibble
{
    const int bits = 4;
    private bool[] _bools = new bool[bits];

    /// <summary>Clears every bit, bringing the value back to 0.</summary>
    public void Reset()
    {
        for ( int pos = _bools.Length - 1; pos >= 0; pos-- )
            _bools[pos] = false;
    }

    /// <summary>
    /// Adds one. The carry into bit 0 is true; each bit becomes bit XOR carry
    /// and the carry out is bit AND carry. Prints "Overflow!" when a carry
    /// leaves the top bit.
    /// </summary>
    public void Increment()
    {
        var next = new bool[bits];
        bool carry = true;

        for ( int pos = 0; pos < bits; pos++ )
        {
            bool bit = _bools[pos];
            next[pos] = bit ^ carry;
            carry = bit && carry;
        }

        if ( carry )
            Console.WriteLine("Overflow!");

        _bools = next;
    }

    /// <summary>The current value, 0 to 15.</summary>
    public byte Value
    {
        get
        {
            int total = 0;
            for ( int pos = 0; pos < bits; pos++ )
            {
                if ( _bools[pos] )
                    total |= 1 << pos;
            }
            return (byte)total;
        }
    }
}
Usage:
// Demo driver: prints the nibble's value and increments it, 17 times in a
// row, so the wrap from 15 back to 0 is visible.
static class Program
{
    static void Main()
    {
        var nibble = new Nibble();
        int remaining = 17;

        while ( remaining-- > 0 )
        {
            Console.WriteLine(nibble.Value);
            nibble.Increment();
        }
    }
}
Run on Ideone here

scan-array CUDA

I'm trying to scan a simple array using CUDA, but there seems to be something wrong with the code below. I have been trying to find what I am doing wrong, but I can't. Can anyone please help me?
#include <stdio.h>
#include <stdlib.h>
// Scans n ints from g_idata into g_odata inside shared memory: partial sums
// are built up a tree in place, the last element is cleared, and the tree is
// then walked back down to produce the scan.
// Each thread loads and stores elements 2*thid and 2*thid+1, so this
// presumably expects n/2 threads in a single block and n to be a power of
// two - TODO confirm against the caller.
__global__ void prescan(int *g_odata, int *g_idata, int n){
// temp is indexed up to n-1 below, so it needs room for n ints.
extern __shared__ int temp[];// allocated on invocation
int thid = threadIdx.x;
int offset = 1;
temp[2*thid] = g_idata[2*thid]; // load input into shared memory
temp[2*thid+1] = g_idata[2*thid+1];
// Upward pass: the number of active threads d halves each level while the
// distance between combined elements (offset) doubles.
for (int d = n>>1; d > 0; d >>= 1){ // build sum in place up the tree
// Barrier so the previous level's writes (or the initial loads) are visible.
__syncthreads();
if (thid < d){
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
if (thid == 0) { temp[n - 1] = 0; } // clear the last element
// Downward pass: d doubles and offset halves; each active thread moves the
// right value into the left slot and adds the old left value to the right.
for (int d = 1; d < n; d *= 2){ // traverse down tree & build scan
offset >>= 1;
__syncthreads();
if (thid < d){
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
int t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
// Wait for the last level before copying the result out.
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
// Host driver: fills numblocks*radix (16) ints with 1 + 2*i, prints them,
// copies them to the device, runs prescan with one block of 8 threads,
// copies the result back and prints it.
// No CUDA call has its return value checked here.
int main(int argc, char *argv[]){
int i;
int *input = 0;
int *output = 0;
int *g_idata = 0;
int *g_odata = 0;
int numblocks = 1;
int radix = 16;
input = (int*)malloc(numblocks*radix*sizeof(int));
output = (int*)malloc(numblocks*radix*sizeof(int));
cudaMalloc((void**)&g_idata, numblocks*radix*sizeof(int));
cudaMalloc((void**)&g_odata, numblocks*radix*sizeof(int));
for(i=0; i<numblocks*radix; i++){
input[i] = 1 + 2*i;
}
for(i=0; i<numblocks*radix; i++){
printf("%d ", input[i]);
}
cudaMemcpy(g_idata, input, numblocks*radix*sizeof(int), cudaMemcpyHostToDevice);
// NOTE(review): prescan declares `extern __shared__ int temp[]`, but this
// launch gives only grid and block sizes and no dynamic shared memory size.
// It looks like a third launch argument of numblocks*radix*sizeof(int) bytes
// is needed - confirm.
prescan<<<1,8>>>(g_odata, g_idata, numblocks*radix);
// NOTE(review): cudaThreadSynchronize is, as far as I know, deprecated in
// favour of cudaDeviceSynchronize - confirm against the toolkit in use.
cudaThreadSynchronize();
cudaMemcpy(output, g_odata, numblocks*radix*sizeof(int), cudaMemcpyDeviceToHost);
for(i=0; i<numblocks*radix; i++){
printf("%d ", output[i]);
}
free(input);
free(output);
cudaFree(g_idata);
cudaFree(g_odata);
return 0;
}
The output is this: 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.I want to have this output: 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 0 1 4 9 16 25 36 49 64 81 100 121 144 169 196 225
Work through the code below to see how a scan can be implemented in a parallel environment.
The algorithm implemented here is a Hillis-Steele inclusive scan (each output element includes its own input element). It is implemented in shared memory, which will improve the execution time for large data sets.
#include<stdio.h>
#include<math.h>
// Inclusive prefix sum of the first n ints of d_in into d_out, computed by a
// single thread block in dynamic shared memory (Hillis-Steele style: at each
// step every element adds the element `stride` positions to its left).
//
// Launch with one block of at least n threads and at least n * sizeof(int)
// bytes of dynamic shared memory. Threads with an index of n or more do not
// touch memory but still take part in every barrier.
__global__ void scan(int *d_in,int *d_out,int n)
{
    extern __shared__ int sdata[];
    int tid = threadIdx.x;
    bool active = tid < n;

    if (active)
        sdata[tid] = d_in[tid];
    // Every element must be loaded before any thread reads a neighbour.
    __syncthreads();

    for (int stride = 1; stride < n; stride <<= 1)
    {
        bool adds = active && tid >= stride;
        int addend = 0;

        if (adds)
            addend = sdata[tid - stride];
        // All reads of this step finish before any element is overwritten;
        // updating in place without this barrier is a data race.
        __syncthreads();

        if (adds)
            sdata[tid] += addend;
        // All writes of this step finish before the next step reads.
        __syncthreads();
    }

    if (active)
        d_out[tid] = sdata[tid];
}
// Host driver: scans the 16 values 2*i+1 on the device with one block of 16
// threads and prints the input followed by the result.
int main()
{
    const int N = 16;
    int h_in[N], h_out[N];
    int i;

    for (i = 0; i < N; i++)
        h_in[i] = 2*i+1;
    for (i = 0; i < N; i++)
        printf("%d ", h_in[i]);

    int *d_in;
    int *d_out;
    cudaMalloc((void**)&d_in, sizeof(int) * N);
    cudaMalloc((void**)&d_out, sizeof(int) * N);
    cudaMemcpy(d_in, h_in, sizeof(int) * N, cudaMemcpyHostToDevice);

    // Third launch argument: bytes of dynamic shared memory for the kernel.
    scan <<<1, N, sizeof(int) * N >>>(d_in, d_out, N);

    cudaMemcpy(h_out, d_out, sizeof(int) * N, cudaMemcpyDeviceToHost);
    for (i = 0; i < N; i++)
        printf("%d ", h_out[i]);

    // Release the device buffers; the original never freed them.
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}

Code Golf: Collatz Conjecture

Locked. This question and its answers are locked because the question is off-topic but has historical significance. It is not currently accepting new answers or interactions.
Inspired by http://xkcd.com/710/ here is a code golf for it.
The Challenge
Given a positive integer greater than 0, print out the hailstone sequence for that number.
The Hailstone Sequence
See Wikipedia for more detail..
If the number is even, divide it by two.
If the number is odd, triple it and add one.
Repeat this with the number produced until it reaches 1. (if it continues after 1, it will go in an infinite loop of 1 -> 4 -> 2 -> 1...)
Sometimes code is the best way to explain, so here is some from Wikipedia
function collatz(n)
show n
if n > 1
if n is odd
call collatz(3n + 1)
else
call collatz(n / 2)
This code works, but I am adding on an extra challenge. The program must not be vulnerable to stack overflows. So it must either use iteration or tail recursion.
Also, bonus points for if it can calculate big numbers and the language does not already have it implemented. (or if you reimplement big number support using fixed-length integers)
Test case
Number: 21
Results: 21 -> 64 -> 32 -> 16 -> 8 -> 4 -> 2 -> 1
Number: 3
Results: 3 -> 10 -> 5 -> 16 -> 8 -> 4 -> 2 -> 1
Also, the code golf must include full user input and output.
x86 assembly, 1337 characters
;
; To assemble and link this program, just run:
;
; >> $ nasm -f elf collatz.asm && gcc -o collatz collatz.o
;
; You can then enjoy its output by passing a number to it on the command line:
;
; >> $ ./collatz 123
; >> 123 --> 370 --> 185 --> 556 --> 278 --> 139 --> 418 --> 209 --> 628 --> 314
; >> --> 157 --> 472 --> 236 --> 118 --> 59 --> 178 --> 89 --> 268 --> 134 --> 67
; >> --> 202 --> 101 --> 304 --> 152 --> 76 --> 38 --> 19 --> 58 --> 29 --> 88
; >> --> 44 --> 22 --> 11 --> 34 --> 17 --> 52 --> 26 --> 13 --> 40 --> 20 --> 10
; >> --> 5 --> 16 --> 8 --> 4 --> 2 --> 1
;
; There's even some error checking involved:
; >> $ ./collatz
; >> Usage: ./collatz NUMBER
;
section .text
global main
extern printf
extern atoi
main:
cmp dword [esp+0x04], 2
jne .usage
mov ebx, [esp+0x08]
push dword [ebx+0x04]
call atoi
add esp, 4
cmp eax, 0
je .usage
mov ebx, eax
push eax
push msg
.loop:
mov [esp+0x04], ebx
call printf
test ebx, 0x01
jz .even
.odd:
lea ebx, [1+ebx*2+ebx]
jmp .loop
.even:
shr ebx, 1
cmp ebx, 1
jne .loop
push ebx
push end
call printf
add esp, 16
xor eax, eax
ret
.usage:
mov ebx, [esp+0x08]
push dword [ebx+0x00]
push usage
call printf
add esp, 8
mov eax, 1
ret
msg db "%d --> ", 0
end db "%d", 10, 0
usage db "Usage: %s NUMBER", 10, 0
Befunge
&>:.:1-|
>3*^ #
|%2: <
v>2/>+
LOLCODE: 406 CHARAKTERZ
HAI
BTW COLLATZ SOUNDZ JUS LULZ
CAN HAS STDIO?
I HAS A NUMBAR
BTW, I WANTS UR NUMBAR
GIMMEH NUMBAR
VISIBLE NUMBAR
IM IN YR SEQUENZ
MOD OF NUMBAR AN 2
BOTH SAEM IT AN 0, O RLY?
YA RLY, NUMBAR R QUOSHUNT OF NUMBAR AN 2
NO WAI, NUMBAR R SUM OF PRODUKT OF NUMBAR AN 3 AN 1
OIC
VISIBLE NUMBAR
DIFFRINT 2 AN SMALLR OF 2 AN NUMBAR, O RLY?
YA RLY, GTFO
OIC
IM OUTTA YR SEQUENZ
KTHXBYE
TESTD UNDR JUSTIN J. MEZA'S INTERPRETR. KTHXBYE!
Python - 95 64 51 46 char
Obviously does not produce a stack overflow.
n=input()
while n>1:n=(n/2,n*3+1)[n%2];print n
Perl
I decided to be a little anticompetitive, and show how you would normally code such problem in Perl.
There is also a 46 (total) char code-golf entry at the end.
These first three examples all start out with this header.
#! /usr/bin/env perl
use Modern::Perl;
# which is the same as these three lines:
# use 5.10.0;
# use strict;
# use warnings;
while( <> ){
chomp;
last unless $_;
Collatz( $_ );
}
Simple recursive version
use Sub::Call::Recur;
# Print the Collatz sequence starting at $n, one value per line.
# Dies unless the argument is a positive integer.
# `recur` re-enters the sub as a tail call, so the sequence length does not
# grow the call stack.
sub Collatz{
    my( $n ) = @_;
    $n += 0; # ensure that it is numeric
    die 'invalid value' unless $n > 0;
    die 'Integer values only' unless $n == int $n;
    say $n;
    given( $n ){
        when( 1 ){}
        when( $_ % 2 != 0 ){ # odd
            recur( 3 * $n + 1 );
        }
        default{ # even
            recur( $n / 2 );
        }
    }
}
Simple iterative version
# Print the Collatz sequence starting at $n, one value per line, using a
# plain loop. Dies unless the argument is a positive integer.
sub Collatz{
    my( $n ) = @_;
    $n += 0; # ensure that it is numeric
    die 'invalid value' unless $n > 0;
    die 'Integer values only' unless $n == int $n;
    say $n;
    while( $n > 1 ){
        if( $n % 2 ){ # odd
            $n = 3 * $n + 1;
        } else { #even
            $n = $n / 2;
        }
        say $n;
    }
}
Optimized iterative version
# Print the Collatz sequence starting at $n, one value per line.
# The successor of every value seen is remembered in @next, which persists
# between calls, so a tail that was computed before is replayed from the
# table instead of being recomputed.
# Dies unless the argument is a positive integer.
sub Collatz{
    my( $n ) = @_;
    $n += 0; # ensure that it is numeric
    die 'invalid value' unless $n > 0;
    die 'Integer values only' unless $n == int $n;

    state @next;
    $next[1] //= 0; # sets $next[1] to 0 if it is undefined

    # fill out @next until we get to a value we've already worked on
    until( defined $next[$n] ){
        say $n;

        if( $n % 2 ){ # odd
            $next[$n] = 3 * $n + 1;
        } else { # even
            $next[$n] = $n / 2;
        }

        $n = $next[$n];
    }
    say $n;
    # finish running until we get to 1
    # ($next[1] is 0, which ends this loop)
    say $n while $n = $next[$n];
}
Now I'm going to show how you would do that last example with a version of Perl prior to v5.10.0
#! /usr/bin/env perl
use strict;
use warnings;
while( <> ){
chomp;
last unless $_;
Collatz( $_ );
}
# Same memoising Collatz printer for Perl versions without `state` and `say`:
# the enclosing bare block gives @next a private scope that survives between
# calls.
{
    my @next = (0,0); # essentially the same as a state variable
    sub Collatz{
        my( $n ) = @_;
        $n += 0; # ensure that it is numeric
        die 'invalid value' unless $n > 0;
        # fill out @next until we get to a value we've already worked on
        until( $n == 1 or defined $next[$n] ){
            print $n, "\n";
            if( $n % 2 ){ # odd
                $next[$n] = 3 * $n + 1;
            } else { # even
                $next[$n] = $n / 2;
            }
            $n = $next[$n];
        }
        print $n, "\n";
        # finish running until we get to 1
        # ($next[1] is 0, which ends this loop)
        print $n, "\n" while $n = $next[$n];
    }
}
Benchmark
First off the IO is always going to be the slow part. So if you actually benchmarked them as-is you should get about the same speed out of each one.
To test these then, I opened a file handle to /dev/null ($null), and edited every say $n to instead read say {$null} $n. This is to reduce the dependence on IO.
#! /usr/bin/env perl
use Modern::Perl;
use autodie;
open our $null, '>', '/dev/null';
use Benchmark qw':all';
cmpthese( -10,
{
Recursive => sub{ Collatz_r( 31 ) },
Iterative => sub{ Collatz_i( 31 ) },
Optimized => sub{ Collatz_o( 31 ) },
});
sub Collatz_r{
...
say {$null} $n;
...
}
sub Collatz_i{
...
say {$null} $n;
...
}
sub Collatz_o{
...
say {$null} $n;
...
}
After having run it 10 times, here is a representative sample output:
Rate Recursive Iterative Optimized
Recursive 1715/s -- -27% -46%
Iterative 2336/s 36% -- -27%
Optimized 3187/s 86% 36% --
Finally, a real code-golf entry:
perl -nlE'say;say$_=$_%2?3*$_+1:$_/2while$_>1'
46 chars total
If you don't need to print the starting value, you could remove 5 more characters.
perl -nE'say$_=$_%2?3*$_+1:$_/2while$_>1'
41 chars total
31 chars for the actual code portion, but the code won't work without the -n switch. So I include the entire example in my count.
Haskell, 62 chars 63 76 83, 86, 97, 137
c 1=[1]
c n=n:c(div(n`mod`2*(5*n+2)+n)2)
main=readLn>>=print.c
User input, printed output, uses constant memory and stack, works with arbitrarily big integers.
A sample run of this code, given an 80 digit number of all '1's (!) as input, is pretty fun to look at.
Original, function only version:
Haskell 51 chars
f n=n:[[],f([n`div`2,3*n+1]!!(n`mod`2))]!!(1`mod`n)
Who the #&^# needs conditionals, anyway?
(edit: I was being "clever" and used fix. Without it, the code dropped to 54 chars.
edit2: dropped to 51 by factoring out f())
Golfscript : 20 chars
~{(}{3*).1&5*)/}/1+`
#
# Usage: echo 21 | ruby golfscript.rb collatz.gs
This is equivalent to
stack<int> s;
s.push(21);
while (s.top() - 1) {
int x = s.top();
int numerator = x*3+1;
int denominator = (numerator&1) * 5 + 1;
s.push(numerator/denominator);
}
s.push(1);
return s;
bc 41 chars
I guess this kind of problems is what bc was invented for:
for(n=read();n>1;){if(n%2)n=n*6+2;n/=2;n}
Test:
bc1 -q collatz.bc
21
64
32
16
8
4
2
1
Proper code:
for(n=read();n>1;){if(n%2)n=n*3+1else n/=2;print n,"\n"}
bc handles numbers with up to INT_MAX digits
Edit: The Wikipedia article mentions this conjecture has been checked for all values up to 20×2^58 (approx. 5.76e18). This program:
c=0;for(n=2^20000+1;n>1;){if(n%2)n=n*6+2;n/=2;c+=1};n;c
tests 2^20,000+1 (approx. 3.98e6,020) in 68 seconds, 144,404 cycles.
Perl : 31 chars
perl -nE 'say$_=$_%2?$_*3+1:$_/2while$_>1'
# 123456789 123456789 123456789 1234567
Edited to remove 2 unnecessary spaces.
Edited to remove 1 unnecessary space.
MS Excel, 35 chars
=IF(A1/2=ROUND(A1/2,0),A1/2,A1*3+1)
Taken straight from Wikipedia:
In cell A1, place the starting number.
In cell A2 enter this formula =IF(A1/2=ROUND(A1/2,0),A1/2,A1*3+1)
Drag and copy the formula down until 4, 2, 1
It only took copy/pasting the formula 111 times to get the result for a starting number of 1000. ;)
C : 64 chars
main(x){for(scanf("%d",&x);x>=printf("%d,",x);x=x&1?3*x+1:x/2);}
With big integer support: 431 (necessary) chars
#include <stdlib.h>
/*
 * Arbitrary-precision Collatz printer working on an array of decimal digits.
 *
 *   d  digit buffer, least significant digit at d[0] once the input has been reversed
 *   m  current capacity of d (starts at 9, doubled by B)
 *   w  number of digits in use
 *   B  grows d with realloc when w has reached the capacity
 *   S  swaps two elements using t as scratch
 *
 * Input: characters are read while (getchar()+2)/10 == 5, i.e. while they are '0'..'9';
 * (c+2)%10 is the digit value. The digits are then reversed in place.
 *
 * Each pass of the main loop: drop leading zero digits, print the number most significant
 * digit first followed by a newline, stop if it is the single digit 1; otherwise either
 * multiply every digit by 3 and add 1 to d[0] (odd), or halve from the top digit down,
 * moving each remainder into the next lower digit as +10 (even). Finally a zero digit is
 * appended and carries are propagated upward so every digit is back in 0..9; the loop's
 * w++ accounts for that extra digit.
 */
#define B (w>=m?d=realloc(d,m=m+m):0)
#define S(a,b)t=a,a=b,b=t
main(m,w,i,t){char*d=malloc(m=9);for(w=0;(i=getchar()+2)/10==5;)
B,d[w++]=i%10;for(i=0;i<w/2;i++)S(d[i],d[w-i-1]);for(;;w++){
while(w&&!d[w-1])w--;for(i=w+1;i--;)putchar(i?d[i-1]+48:10);if(
w==1&&*d==1)break;if(*d&1){for(i=w;i--;)d[i]*=3;*d+=1;}else{
for(i=w;i-->1;)d[i-1]+=d[i]%2*10,d[i]/=2;*d/=2;}B,d[w]=0;for(i=0
;i<w;i++)d[i+1]+=d[i]/10,d[i]%=10;}}
Note: Do not remove #include <stdlib.h> without at least prototyping malloc/realloc, as doing so will not be safe on 64-bit platforms (64-bit void* will be converted to 32-bit int).
This one hasn't been tested vigorously yet. It could use some shortening as well.
Previous versions:
main(x){for(scanf("%d",&x);printf("%d,",x),x-1;x=x&1?3*x+1:x/2);} // 66
(removed 12 chars because no one follows the output format... :| )
Another assembler version. This one is not limited to 32 bit numbers, it can handle numbers up to 10^65534 although the ".com" format MS-DOS uses is limited to 80 digit numbers. Written for A86 assembler and requires a Win-XP DOS box to run. Assembles to 180 bytes:
; Collatz sequence on a decimal digit string, one digit value (0..9) per byte.
; The digits live in the segment 1000h paragraphs above CS; BP indexes the most
; significant digit, BX the least significant one.
;
; --- setup: ES = CS + 1000h, SI = 82h, BX = byte at [80h] ---
; (presumably the DOS command-tail length and text of a .com program — confirm)
mov ax,cs
mov si,82h
add ah,10h
mov es,ax
mov bh,0
mov bl,byte ptr [80h]
cmp bl,1
jbe ret
; CX = number of characters to copy, BX = index of the last digit
dec bl
mov cx,bx
dec bl
xor di,di
; --- copy the characters to ES:0.., converting ASCII to digit values; quit on a non-digit ---
p1:lodsb
sub al,'0'
cmp al,10
jae ret
stosb
loop p1
; from here on DS = ES = digit segment
xor bp,bp
push es
pop ds
; --- p2: main loop entry; advance BP past leading zero digits ---
p2:cmp byte ptr ds:[bp],0
jne p3
inc bp
jmp p2
ret
; --- p3/p4: print digits BP..BX with int 21h, AH=2 (character in DL) ---
p3:lea si,[bp-1]
cld
p4:inc si
mov dl,[si]
add dl,'0'
mov ah,2
int 21h
cmp si,bx
jne p4
; a single digit equal to 1 ends the program
cmp bx,bp
jne p5
cmp byte ptr [bx],1
je ret
; --- p5: print the "->" separator, then branch on the parity of the last digit ---
p5:mov dl,'-'
mov ah,2
int 21h
mov dl,'>'
int 21h
test byte ptr [bx],1
jz p10
;odd
; multiply by 3 in place, least significant digit first (direction flag set);
; DL = 3, DH = carry, AAM splits the product into tens (AH) and ones (AL)
mov si,bx
mov di,si
mov dx,3
dec bp
std
p6:lodsb
mul dl
add al,dh
aam
mov dh,ah
stosb
cmp si,bp
jnz p6
; a leftover carry becomes a new most significant digit
or dh,dh
jz p7
mov al,dh
stosb
dec bp
; --- p7/p8: add 1, starting at the last digit; AAA leaves a decimal carry in AH ---
p7:mov si,bx
mov di,si
p8:lodsb
inc al
xor ah,ah
aaa
stosb
or ah,ah
jz p9
cmp si,bp
jne p8
; the carry ran past the top digit: store a new leading 1
mov al,1
stosb
jmp p2
; no further carry: undo the earlier "dec bp" and continue
p9:inc bp
jmp p2
; --- p10: even, halve in place from the most significant digit down ---
; AH keeps the previous digit before halving; if it was odd, 10 is added to the current one
p10:mov si,bp
mov di,bp
xor ax,ax
p11:lodsb
test ah,1
jz p12
add al,10
p12:mov ah,al
shr al,1
; the compare is done before STOSB advances DI; STOSB leaves the flags untouched
cmp di,bx
stosb
jne p11
jmp p2
dc - 24 chars 25 28
dc is a good tool for this sequence:
# Read n. Macro L replaces n by (n + (5n+2)*((5n+2)%2)) / 2, which is 3n+1 for odd n
# and n/2 for even n, prints it, and calls itself again while 1 < n.
?[d5*2+d2%*+2/pd1<L]dsLx
dc -f collatz.dc
21
64
32
16
8
4
2
1
Also 24 chars using the formula from the Golfscript entry:
# Read n. Macro L replaces n by (3n+1) / (((3n+1)%2)*5+1), prints it, and repeats while 1 < n.
?[3*1+d2%5*1+/pd1<L]dsLx
57 chars to meet the specs:
[Number: ]n?[Results: ]ndn[d5*2+d2%*+2/[ -> ]ndnd1<L]dsLx
dc -f collatz-spec.dc
Number: 3
Results: 3 -> 10 -> 5 -> 16 -> 8 -> 4 -> 2 -> 1
Scheme: 72
; (c n) returns the Collatz sequence starting at n as a list ending in 1.
; NOTE(review): each recursive call is an argument to cons, so it is not in tail
; position; the recursion depth grows with the length of the sequence.
(define(c n)(if(= n 1)`(1)(cons n(if(odd? n)(c(+(* n 3)1))(c(/ n 2))))))
This uses recursion, but the calls are tail-recursive so I think they'll be optimized to iteration. In some quick testing, I haven't been able to find a number for which the stack overflows anyway. Just for example:
(c 9876543219999999999000011234567898888777766665555444433332222
7777777777777777777777777777777798797657657651234143375987342987
5398709812374982529830983743297432985230985739287023987532098579
058095873098753098370938753987)
...runs just fine. [that's all one number -- I've just broken it to fit on screen.]
Mathematica, 45 50 chars
(* c[n] gives the Collatz sequence from n down to and including 1: the step function is
   applied while the latest value exceeds 1. The parity test has to be OddQ@# (prefix
   application to the slot); OddQ## parses as a product with a slot sequence and never
   evaluates to True or False, so If would stay unevaluated. *)
c=NestWhileList[If[OddQ@#,3#+1,#/2]&,#,#>1&]&
Ruby, 50 chars, no stack overflow
Basically a direct rip of makapuf's Python solution:
# c(n): while n exceeds 1, replace n by 3n+1 (odd) or n/2 (even) and print it with p.
# The starting value itself is not printed.
def c(n)while n>1;n=n.odd?? n*3+1: n/2;p n end end
Ruby, 45 chars, will overflow
Basically a direct rip of the code provided in the question:
# c(n): prints n, then recurses on the next Collatz value while n exceeds 1
# (one nested call per step).
def c(n)p n;n.odd?? c(3*n+1):c(n/2)if n>1 end
import java.math.BigInteger;
public class SortaJava {
static final BigInteger THREE = new BigInteger("3");
static final BigInteger TWO = new BigInteger("2");
interface BiFunc<R, A, B> {
R call(A a, B b);
}
interface Cons<A, B> {
<R> R apply(BiFunc<R, A, B> func);
}
static class Collatz implements Cons<BigInteger, Collatz> {
BigInteger value;
public Collatz(BigInteger value) { this.value = value; }
public <R> R apply(BiFunc<R, BigInteger, Collatz> func) {
if(BigInteger.ONE.equals(value))
return func.call(value, null);
if(value.testBit(0))
return func.call(value, new Collatz((value.multiply(THREE)).add(BigInteger.ONE)));
return func.call(value, new Collatz(value.divide(TWO)));
}
}
static class PrintAReturnB<A, B> implements BiFunc<B, A, B> {
boolean first = true;
public B call(A a, B b) {
if(first)
first = false;
else
System.out.print(" -> ");
System.out.print(a);
return b;
}
}
public static void main(String[] args) {
BiFunc<Collatz, BigInteger, Collatz> printer = new PrintAReturnB<BigInteger, Collatz>();
Collatz collatz = new Collatz(new BigInteger(args[0]));
while(collatz != null)
collatz = collatz.apply(printer);
}
}
Python 45 Char
Shaved a char off of makapuf's answer.
# Python 2 script (uses the print statement).
# ~-n equals n-1, so the loop runs until n reaches 1; the tuple index n%2 picks
# n/2 for even n and n*3+1 for odd n. Only the successors are printed.
n=input()
while~-n:n=(n/2,n*3+1)[n%2];print n
TI-BASIC
Not the shortest, but a novel approach. Certain to slow down considerably with large sequences, but it shouldn't overflow.
PROGRAM:COLLATZ
:ClrHome
:Input X
:Lbl 1
:While X≠1
:If X/2=int(X/2)
:Then
:Disp X/2→X
:Else
:Disp X*3+1→X
:End
:Goto 1
:End
Haskell : 50
-- c n: the Collatz sequence starting at n, as a list that ends at the first 1.
c 1=[1];c n=n:(c$if odd n then 3*n+1 else n`div`2)
Not the shortest, but an elegant Clojure solution:
(defn collatz
  "Prints n followed by a space, then repeats with the next Collatz value
  for as long as the value just printed is greater than 1. Returns nil."
  [n]
  (print n "")
  (when (< 1 n)
    (recur (if (even? n)
             (/ n 2)
             (inc (* 3 n))))))
C#: 216 Characters
using C=System.Console;class P{static void Main(){var p="start:";System.Action<object> o=C.Write;o(p);ulong i;while(ulong.TryParse(C.ReadLine(),out i)){o(i);while(i > 1){i=i%2==0?i/2:i*3+1;o(" -> "+i);}o("\n"+p);}}}
in long form:
using C = System.Console;
// Interactive version: prints the prompt "start:", reads a line, and for every line that
// parses as a ulong prints its Collatz sequence joined by " -> ", then prompts again.
// The outer loop ends when TryParse fails.
class P
{
static void Main()
{
var p = "start:";
// o is shorthand for Console.Write (C is the alias declared in the using directive)
System.Action<object> o = C.Write;
o(p);
ulong i;
while (ulong.TryParse(C.ReadLine(), out i))
{
o(i);
while (i > 1)
{
// even: halve; odd: 3i+1
i = i % 2 == 0 ? i / 2 : i * 3 + 1;
o(" -> " + i);
}
o("\n" + p);
}
}
}
New Version, accepts one number as input provided through the command line, no input validation. 173 154 characters.
using System;class P{static void Main(string[]a){Action<object>o=Console.Write;var i=ulong.Parse(a[0]);o(i);while(i>1){i=i%2==0?i/2:i*3+1;o(" -> "+i);}}}
in long form:
using System;
// Command-line version: parses a[0] as a ulong (no validation) and prints its
// Collatz sequence joined by " -> ".
class P
{
static void Main(string[]a)
{
// o is shorthand for Console.Write
Action<object>o=Console.Write;
var i=ulong.Parse(a[0]);
o(i);
while(i>1)
{
// even: halve; odd: 3i+1
i=i%2==0?i/2:i*3+1;
o(" -> "+i);
}
}
}
I am able to shave a few characters by ripping off the idea in this answer to use a for loop rather than a while. 150 characters.
using System;class P{static void Main(string[]a){Action<object>o=Console.Write;for(var i=ulong.Parse(a[0]);i>1;i=i%2==0?i/2:i*3+1)o(i+" -> ");o(1);}}
Ruby, 43 characters
bignum supported, with stack overflow susceptibility:
def c(n)p n;n%2>0?c(3*n+1):c(n/2)if n>1 end
...and 50 characters, bignum supported, without stack overflow:
def d(n)while n>1 do p n;n=n%2>0?3*n+1:n/2 end end
Kudos to Jordan. I didn't know about 'p' as a replacement for puts.
nroff1
Run with nroff -U hail.g
.warn
.pl 1
.pso (printf "Enter a number: " 1>&2); read x; echo .nr x $x
.while \nx>1 \{\
. ie \nx%2 .nr x \nx*3+1
. el .nr x \nx/2
\nx
.\}
1. groff version
Scala + Scalaz
import scalaz._
import Scalaz._
val collatz =
(_:Int).iterate[Stream](a=>Seq(a/2,3*a+1)(a%2)).takeWhile(1<) // This line: 61 chars
And in action:
scala> collatz(7).toList
res15: List[Int] = List(7, 22, 11, 34, 17, 52, 26, 13, 40, 20, 10, 5, 16, 8, 4, 2)
Scala 2.8
val collatz =
Stream.iterate(_:Int)(a=>Seq(a/2,3*a+1)(a%2)).takeWhile(1<) :+ 1
This also includes the trailing 1.
scala> collatz(7)
res12: scala.collection.immutable.Stream[Int] = Stream(7, 22, 11, 34, 17, 52, 26, 13, 40, 20, 10, 5, 16, 8, 4, 2, 1)
With the following implicit
// Implicit conversion giving Int a `~` method: applies `even` to the receiver
// when it is divisible by 2, and `odd` otherwise.
implicit def intToEven(i:Int) = new {
def ~(even: Int=>Int, odd: Int=>Int) = {
if (i%2==0) { even(i) } else { odd(i) }
}
}
this can be shortened to
val collatz = Stream.iterate(_:Int)(_~(_/2,3*_+1)).takeWhile(1<) :+ 1
Edit - 58 characters (including input and output, but not including initial number)
var n=readInt;while(n>1){n=Seq(n/2,n*3+1)(n%2);println(n)}
Could be reduced by 2 if you don't need newlines...
F#, 90 characters
// c start: unfolds the sequence from start. The unfold stops as soon as the state
// is <= 1, so that final value is not part of the resulting sequence.
let c=Seq.unfold(function|n when n<=1->None|n when n%2=0->Some(n,n/2)|n->Some(n,(3*n)+1))
> c 21;;
val it : seq<int> = seq [21; 64; 32; 16; ...]
Or if you're not using F# interactive to display the result, 102 characters:
let c=Seq.unfold(function|n when n<=1->None|n when n%2=0->Some(n,n/2)|n->Some(n,(3*n)+1))>>printf"%A"
Common Lisp, 141 characters:
;; Prompts with "Number: ", reads n, and prints every value of the sequence followed
;; by " -> " until n equals 1; then prints "1" and a newline.
(defun c ()
(format t"Number: ")
;; (+ 1 n n n) is 3n+1
(loop for n = (read) then (if(oddp n)(+ 1 n n n)(/ n 2))
until (= n 1)
do (format t"~d -> "n))
(format t"1~%"))
Test run:
Number: 171
171 -> 514 -> 257 -> 772 -> 386 -> 193 -> 580 -> 290 -> 145 -> 436 ->
218 -> 109 -> 328 -> 164 -> 82 -> 41 -> 124 -> 62 -> 31 -> 94 -> 47 ->
142 -> 71 -> 214 -> 107 -> 322 -> 161 -> 484 -> 242 -> 121 -> 364 ->
182 -> 91 -> 274 -> 137 -> 412 -> 206 -> 103 -> 310 -> 155 -> 466 ->
233 -> 700 -> 350 -> 175 -> 526 -> 263 -> 790 -> 395 -> 1186 -> 593 ->
1780 -> 890 -> 445 -> 1336 -> 668 -> 334 -> 167 -> 502 -> 251 -> 754 ->
377 -> 1132 -> 566 -> 283 -> 850 -> 425 -> 1276 -> 638 -> 319 ->
958 -> 479 -> 1438 -> 719 -> 2158 -> 1079 -> 3238 -> 1619 -> 4858 ->
2429 -> 7288 -> 3644 -> 1822 -> 911 -> 2734 -> 1367 -> 4102 -> 2051 ->
6154 -> 3077 -> 9232 -> 4616 -> 2308 -> 1154 -> 577 -> 1732 -> 866 ->
433 -> 1300 -> 650 -> 325 -> 976 -> 488 -> 244 -> 122 -> 61 -> 184 ->
92 -> 46 -> 23 -> 70 -> 35 -> 106 -> 53 -> 160 -> 80 -> 40 -> 20 ->
10 -> 5 -> 16 -> 8 -> 4 -> 2 -> 1
The program from Jerry Coffin has integer overflow; try this one:
#include <iostream>
#include <limits>

/// Reads a starting value from standard input, prints its Collatz (hailstone)
/// sequence as "a -> b -> ... -> 1" and then the number of steps taken.
///
/// @return 0 on success; 1 when the input is not an unsigned integer or when
///         the next 3*i+1 step would not fit in unsigned long long (unsigned
///         arithmetic would otherwise wrap silently and print a wrong sequence).
int main()
{
    unsigned long long i = 0;
    if (!(std::cin >> i)) {
        std::cerr << "expected an unsigned integer\n";
        return 1;
    }

    // Largest value for which 3*i+1 is still representable.
    const unsigned long long max_odd =
        (std::numeric_limits<unsigned long long>::max() - 1) / 3;

    int j = 0;
    for (; i > 1; ++j) {
        std::cout << i << " -> ";
        if (i & 1) {
            if (i > max_odd) {
                std::cout << "overflow\n";
                std::cerr << "3*i+1 does not fit in unsigned long long\n";
                return 1;
            }
            i = i * 3 + 1;
        } else {
            i /= 2;
        }
    }
    // Print the value the loop stopped on so the output does not end in a dangling arrow.
    std::cout << i << "\n" << j << " iterations\n";
    return 0;
}
tested with
The number less than 100 million with the longest total stopping time is 63,728,127, with 949 steps.
The number less than 1 billion with the longest total stopping time is 670,617,279, with 986 steps.
ruby, 43, possibly meeting the I/O requirement
Run with ruby -n hail
# Meant to be run with ruby -n, which puts each input line in $_.
# While n exceeds 1, n is replaced by its Collatz successor and printed with p.
n=$_.to_i
(n=n%2>0?n*3+1: n/2
p n)while n>1
C# : 659 chars with BigInteger support
using System.Linq;using C=System.Console;class Program{static void Main(){var v=C.ReadLine();C.Write(v);while(v!="1"){C.Write("->");if(v[v.Length-1]%2==0){v=v.Aggregate(new{s="",o=0},(r,c)=>new{s=r.s+(char)((c-48)/2+r.o+48),o=(c%2)*5}).s.TrimStart('0');}else{var q=v.Reverse().Aggregate(new{s="",o=0},(r, c)=>new{s=(char)((c-48)*3+r.o+(c*3+r.o>153?c*3+r.o>163?28:38:48))+r.s,o=c*3+r.o>153?c*3+r.o>163?2:1:0});var t=(q.o+q.s).TrimStart('0').Reverse();var x=t.First();q=t.Skip(1).Aggregate(new{s=x>56?(x-57).ToString():(x-47).ToString(),o=x>56?1:0},(r,c)=>new{s=(char)(c-48+r.o+(c+r.o>57?38:48))+r.s,o=c+r.o>57?1:0});v=(q.o+q.s).TrimStart('0');}C.Write(v);}}}
Ungolfed
using System.Linq;
using C = System.Console;
// Collatz sequence on a decimal string, so the values are not limited by any integer type.
// Reads one line, echoes it, and keeps printing "->" plus the next value until the
// string is exactly "1". All arithmetic is done digit by digit on character codes
// ('0' is 48, '9' is 57).
class Program
{
static void Main()
{
var v = C.ReadLine();
C.Write(v);
while (v != "1")
{
C.Write("->");
// Parity of the last character code equals parity of the last digit, since 48 is even.
if (v[v.Length - 1] % 2 == 0)
{
// Even: halve, most significant digit first. s collects the output digits,
// o is the carry (5) handed to the next digit when the current digit is odd.
v = v
.Aggregate(
new { s = "", o = 0 },
(r, c) => new { s = r.s + (char)((c - 48) / 2 + r.o + 48), o = (c % 2) * 5 })
.s.TrimStart('0');
}
else
{
// Odd, step 1: multiply by 3, least significant digit first. c * 3 is 144 plus three
// times the digit, so "> 153" means the digit result exceeds 9 (carry 1) and "> 163"
// means it exceeds 19 (carry 2); the 48/38/28 offsets subtract 0/10/20 while converting
// back to a character. Digits are prepended, so s ends up most significant first.
var q = v
.Reverse()
.Aggregate(
new { s = "", o = 0 },
(r, c) => new { s = (char)((c - 48) * 3 + r.o + (c * 3 + r.o > 153 ? c * 3 + r.o > 163 ? 28 : 38 : 48)) + r.s, o = c * 3 + r.o > 153 ? c * 3 + r.o > 163 ? 2 : 1 : 0 });
// Put the final carry in front, drop leading zeros, and reverse again for the +1 pass.
var t = (q.o + q.s)
.TrimStart('0')
.Reverse();
// Odd, step 2: add 1. x is the least significant digit: a '9' (x > 56) becomes "0"
// with carry 1, anything else is simply incremented. The carry is then propagated
// through the remaining digits.
var x = t.First();
q = t
.Skip(1)
.Aggregate(
new { s = x > 56 ? (x - 57).ToString() : (x - 47).ToString(), o = x > 56 ? 1 : 0 },
(r, c) => new { s = (char)(c - 48 + r.o + (c + r.o > 57 ? 38 : 48)) + r.s, o = c + r.o > 57 ? 1 : 0 });
v = (q.o + q.s)
.TrimStart('0');
}
C.Write(v);
}
}
}