AWK: swap fields in a CSV file

The CSV file contains nine fields. Fields $1, $8 and $9 must be kept as they are. Fields $2 through $7 should be collected across all lines that share the same $1 value, and the merged set written back into each of those lines. The rules are hard to describe precisely.
I have to finish this script, or do something like it. I need a standalone script.
BEGIN {
    FS = ";"
    OFS = ""
    x = "\"\""
}
{
    for (i = 2; i <= 7; i++) {
        if ($i != x) {
            k = match(a[$1], $i)
            if (k == 0) {
                a[$1] = a[$1] ";" $i
            }
            b[$1] = b[$1] "-" $8 FS $9
        }
    }
}
END {
    for (g in a) {
        t = split(a[g], A, ";")
        if (t == 2) {
            a[g] = a[g] ";" x ";" x ";" x ";" x ";" x ";"
        }
        if (t == 3) {
            a[g] = a[g] ";" x ";" x ";" x ";" x ";"
        }
        if (t == 4) {
            a[g] = a[g] ";" x ";" x ";" x ";"
        }
        if (t == 5) {
            a[g] = a[g] ";" x ";" x ";"
        }
    }
    for (h in b) {
        q = split(b[h], B, "-")
        for (z = 1; z <= q; z++) {
            b[h] = B[z]
        }
    }
}
CSV File:
"1033reto";"V09B";"";"";"";"";"";"QVN";"V09B"
"1033reto";"V010";"";"";"";"";"";"QVN";"V010"
"1033reto";"V015";"";"";"";"";"";"QVN";"V015"
"1033reto";"V08C";"";"";"";"";"";"QVN";"V08C"
"1040reto";"V03D";"";"";"";"";"";"QVN";"V03D"
"1040reto";"V01C";"";"";"";"";"";"QVN";"V01C"
"1050reto";"V03D";"";"";"";"";"";"QVN";"V03D"
"1050reto";"V01F";"V07L";"";"";"";"";"QVN";"V01C"
Desired Output
"1033reto";"V09B";"V010";"V015";"V08C";"";"QVN";"V09B"
"1033reto";"V09B";"V010";"V015";"V08C";"";"QVN";"V010"
"1033reto";"V09B";"V010";"V015";"V08C";"";"QVN";"V015"
"1033reto";"V09B";"V010";"V015";"V08C";"";"QVN";"V08C"
"1040reto";"V03D";"V01C";"";"";"";"";"QVN";"V03D"
"1040reto";"V03D";"V01C";"";"";"";"";"QVN";"V01C"
"1050reto";"V03D";"V01F";"V07L";"";"";"";"QVN";"V03D"
"1050reto";"V03D";"V01F";"V07L";"";"";"";"QVN";"V01C"

By studying karakfa's code I managed to find an alternative that avoids making a double pass over the file.
BEGIN {
    FS = ";"
    x = "\"\""
}
{
    for (i = 2; i <= 7; i++) {
        if ($i != x) {
            k = match(a[$1], $i)
            if (k == 0) {
                a[$1] = a[$1] ";" $i
            }
        }
    }
    b[$1] = b[$1] "-" $8 FS $9
}
END {
    for (g in a) {
        sub("^;", "", a[g])
        t = split(a[g], A, ";")
        for (y = t; y < 6; y++) {
            a[g] = a[g] ";" x
        }
        mx = split(b[g], B, "-")
        for (i = 2; i <= mx; i++) {
            print g FS a[g] FS B[i]
        }
    }
}
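If the script above is saved as, say, merge.awk (the name is only an example), it runs as a standalone program in the usual way:

$ awk -f merge.awk file.csv

One caveat: for (g in a) visits the groups in an unspecified order, so the output lines may not come out in input order; with gawk, PROCINFO["sorted_in"] can be set to control the traversal order.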


How to use a captured lifetime token in a function parameter

Here is the code:
struct A<'a, 'b> {
    s1: &'a str,
    s2: &'b str,
}
struct B<'a> {
    s3: &'a str,
}
macro_rules! gen_fn {
    ( $name:ident, $param:ty, < $($gen:tt),+ > ) => {
        fn $name< $($gen),+ >(param: $param< $($gen),+ >) {
            todo!()
        }
    }
}
fn main() {
    gen_fn!( op_a, A, <'a, 'b> );
    gen_fn!( op_b, B, <'a> );
}
However, this code doesn't compile: Rust parses the captured $param as a complete, opaque type, so the generic arguments $gen cannot be appended to it afterwards, and the compiler expects a token such as , or ) where the < appears.
When I try to expand the macro, the code is generated as I expect:
fn op_a<'a, 'b>(param: A<'a, 'b>) {
    $crate::panicking::panic("not yet implemented")
}
fn op_b<'a>(param: B<'a>) {
    $crate::panicking::panic("not yet implemented")
}
Am I doing something wrong, or is this just unsupported?
You can use ident instead of ty, but only if you're not using paths (a::b):
macro_rules! gen_fn {
    ( $name:ident, $param:ident, < $($gen:tt),+ > ) => {
        fn $name< $($gen),+ >(param: $param< $($gen),+ >) {
            todo!()
        }
    }
}
If you're using paths, you can use a repeated ident:
macro_rules! gen_fn {
    ( $name:ident, $($param:ident)::+, < $($gen:tt),+ > ) => {
        fn $name< $($gen),+ >(param: $($param)::+< $($gen),+ >) {
            todo!()
        }
    }
}
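With the repeated-ident form, an invocation through a path, for example gen_fn!( op_c, my_mod::B, <'a> ); (op_c and my_mod are hypothetical names), should expand to fn op_c<'a>(param: my_mod::B<'a>) as expected.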

How to print JSON objects in AWK

I was looking for built-in functions in awk for easily generating JSON objects. I came across several answers and decided to write my own.
I'd like to generate JSON from multidimensional arrays in which I store table-style data, using a separate, dynamic definition of the JSON schema to be generated from that data.
Desired output:
{
"Name": JanA
"Surname": NowakA
"ID": 1234A
"Role": PrezesA
}
{
"Name": JanD
"Surname": NowakD
"ID": 12341D
"Role": PrezesD
}
{
"Name": JanC
"Surname": NowakC
"ID": 12342C
"Role": PrezesC
}
Input file:
pierwsza linia
druga linia
trzecia linia
dane wspólników
imie JanA
nazwisko NowakA
pesel 11111111111A
funkcja PrezesA
imie Ja"nD
nazwisko NowakD
pesel 11111111111
funkcja PrezesD
imie JanC
nazwisko NowakC
pesel 12342C
funkcja PrezesC
czwarta linia
reprezentanci
imie Tomek
Based on the input file, I created a multidimensional array:
JanA NowakA 1234A PrezesA
JanD NowakD 12341D PrezesD
JanC NowakC 12342C PrezesC
I'll take a stab at a gawk solution. The indenting isn't perfect and the results aren't ordered (see "Sorting" note below), but it's at least able to walk a true multidimensional array recursively and should produce valid, parsable JSON from any array. Bonus: the data array is the schema. Array keys become JSON keys. There's no need to create a separate schema array in addition to the data array.
Just be sure to use the true multidimensional array[d1][d2][d3]... convention of constructing your data array, rather than the concatenated index array[d1,d2,d3...] convention.
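For illustration, here is a minimal gawk-only sketch of the difference (the array and key names are arbitrary):
BEGIN {
    md["row"]["col"] = 1       # true multidimensional: md["row"] is itself an array
    ci["row", "col"] = 1       # concatenated index: one flat key joined with SUBSEP
    print isarray(md["row"])   # prints 1, so serialize() below can recurse into it
    for (k in ci) {            # a flat key has to be split apart manually
        split(k, parts, SUBSEP)
        print parts[1], parts[2]
    }
}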
Update:
I've got an updated JSON gawk script posted as a GitHub Gist. Although the script below is tested as working with OP's data, I might've made improvements since this post was last edited. Please see the Gist for the most thoroughly tested, bug-squashed version.
#!/usr/bin/gawk -f
BEGIN { IGNORECASE = 1 }

$1 ~ "imie"     { record[++idx]["name"]  = $2 }
$1 ~ "nazwisko" { record[idx]["surname"] = $2 }
$1 ~ "pesel"    { record[idx]["ID"]      = $2 }
$1 ~ "funkcja"  { record[idx]["role"]    = $2 }

END { print serialize(record, "\t") }

# ==== FUNCTIONS ====

function join(arr, sep, _p, i) {
    # syntax: join(array, string separator)
    # returns a string
    for (i in arr) {
        _p["result"] = _p["result"] ~ "[[:print:]]" ? _p["result"] sep arr[i] : arr[i]
    }
    return _p["result"]
}

function quote(str) {
    gsub(/\\/, "\\\\", str)
    gsub(/\r/, "\\r", str)
    gsub(/\n/, "\\n", str)
    gsub(/\t/, "\\t", str)
    return "\"" str "\""
}

function serialize(arr, indent_with, depth, _p, i, idx) {
    # syntax: serialize(array of arrays, indent string)
    # returns a JSON formatted string

    # sort arrays on key, ensures [...] values remain properly ordered
    if (!PROCINFO["sorted_in"]) PROCINFO["sorted_in"] = "#ind_num_asc"

    # determine whether array is indexed or associative
    for (i in arr) {
        _p["assoc"] = or(_p["assoc"], !(++_p["idx"] in arr))
    }

    # if associative, indent
    if (_p["assoc"]) {
        for (i = ++depth; i--;) {
            _p["end"] = _p["indent"]; _p["indent"] = _p["indent"] indent_with
        }
    }

    for (i in arr) {
        # if key length is 0, assume it's an empty object
        if (!length(i)) return "{}"

        # quote key if not already quoted
        _p["key"] = i !~ /^".*"$/ ? quote(i) : i

        if (isarray(arr[i])) {
            if (_p["assoc"]) {
                _p["json"][++idx] = _p["indent"] _p["key"] ": " \
                    serialize(arr[i], indent_with, depth)
            } else {
                # if indexed array, don't print keys
                _p["json"][++idx] = serialize(arr[i], indent_with, depth)
            }
        } else {
            # quote if not numeric, boolean, null, already quoted, or too big for match()
            if (!((arr[i] ~ /^[0-9]+([\.e][0-9]+)?$/ && arr[i] !~ /^0[0-9]/) ||
                  arr[i] ~ /^true|false|null|".*"$/) || length(arr[i]) > 1000)
                arr[i] = quote(arr[i])
            _p["json"][++idx] = _p["assoc"] ? _p["indent"] _p["key"] ": " arr[i] : arr[i]
        }
    }

    # I trial-and-errored the hell out of this. Problem is, gawk can't distinguish between
    # a value of null and no value. I think this hack is as close as I can get, although
    # [""] will become [].
    if (!_p["assoc"] && join(_p["json"]) == "\"\"") return "[]"

    # surround with curly braces if object, square brackets if array
    return _p["assoc"] ? "{\n" join(_p["json"], ",\n") "\n" _p["end"] "}" \
                       : "[" join(_p["json"], ", ") "]"
}
Output resulting from OP's example data:
[{
"ID": "1234A",
"name": "JanA",
"role": "PrezesA",
"surname": "NowakA"
}, {
"ID": "12341D",
"name": "JanD",
"role": "PrezesD",
"surname": "NowakD"
}, {
"ID": "12342C",
"name": "JanC",
"role": "PrezesC",
"surname": "NowakC"
}, {
"name": "Tomek"
}]
Sorting
Although the results by default are ordered in a manner only gawk understands, it is possible for gawk to sort the results on a field. If you'd like to sort on the ID field for example, add this function:
function cmp_ID(i1, v1, i2, v2) {
    if (!isarray(v1) && v1 ~ /"ID"/) {
        return v1 < v2 ? -1 : (v1 != v2)
    }
}
Then insert this line within your END section above print serialize(record):
PROCINFO["sorted_in"] = "cmp_ID"
See Controlling Array Traversal for more information.
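Putting the two pieces together, the END section of the script above would then read:
END {
    PROCINFO["sorted_in"] = "cmp_ID"   # traverse record[] in cmp_ID() order
    print serialize(record, "\t")
}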
My updated awk implementation of a simple array printer, with regex-based validation for each column (run using gawk):
function ltrim(s) { sub(/^[ \t]+/, "", s); return s }
function rtrim(s) { sub(/[ \t]+$/, "", s); return s }
function sTrim(s) {
    return rtrim(ltrim(s))
}

function jsonEscape(jsValue) {
    gsub(/\\/, "\\\\", jsValue)
    gsub(/"/, "\\\"", jsValue)
    gsub(/\b/, "\\b", jsValue)
    gsub(/\f/, "\\f", jsValue)
    gsub(/\n/, "\\n", jsValue)
    gsub(/\r/, "\\r", jsValue)
    gsub(/\t/, "\\t", jsValue)
    return jsValue
}

function jsonStringEscapeAndWrap(jsValue) {
    # \42 is the octal escape for the double-quote character
    return "\42" jsonEscape(jsValue) "\42"
}
function jsonPrint(contentArray, contentRowsCount, schemaArray) {
    result = ""
    schemaLength = length(schemaArray)
    for (x = 1; x <= contentRowsCount; x++) {
        result = result "{"
        for (y = 1; y <= schemaLength; y++) {
            result = result "\42" sTrim(schemaArray[y]) "\42:" sTrim(contentArray[x, y])
            if (y < schemaLength) {
                result = result ","
            }
        }
        result = result "}"
        if (x < contentRowsCount) {
            result = result ",\n"
        }
    }
    return result
}
function jsonValidateAndPrint(contentArray, contentRowsCount, schemaArray, schemaColumnsCount, errorArray) {
    result = ""
    errorsCount = 1
    for (x = 1; x <= contentRowsCount; x++) {
        jsonRow = "{"
        for (y = 1; y <= schemaColumnsCount; y++) {
            regexValue = schemaArray[y, 2]
            jsonValue = sTrim(contentArray[x, y])
            isValid = jsonValue ~ regexValue
            if (isValid == 0) {
                errorArray[errorsCount, 1] = "\42" sTrim(schemaArray[y, 1]) "\42"
                errorArray[errorsCount, 2] = "\42Value " jsonValue " not match format: " regexValue " \42"
                errorArray[errorsCount, 3] = x
                errorsCount++
                jsonValue = "null"
            }
            jsonRow = jsonRow "\42" sTrim(schemaArray[y, 1]) "\42:" jsonValue
            if (y < schemaColumnsCount) {
                jsonRow = jsonRow ","
            }
        }
        jsonRow = jsonRow "}"
        result = result jsonRow
        if (x < contentRowsCount) {
            result = result ",\n"
        }
    }
    return result
}
BEGIN {
    rowsCount = 1
    matchCount = 0
    errorsCount = 0

    shareholdersJsonSchema[1, 1] = "Imie"
    shareholdersJsonSchema[2, 1] = "Nazwisko"
    shareholdersJsonSchema[3, 1] = "PESEL"
    shareholdersJsonSchema[4, 1] = "Funkcja"
    shareholdersJsonSchema[1, 2] = "\\.*"
    shareholdersJsonSchema[2, 2] = "\\.*"
    shareholdersJsonSchema[3, 2] = "^[0-9]{11}$"
    shareholdersJsonSchema[4, 2] = "\\.*"

    errorsSchema[1] = "PropertyName"
    errorsSchema[2] = "Message"
    errorsSchema[3] = "PositionIndex"

    resultSchema[1] = "ShareHolders"
    resultSchema[2] = "Errors"
}
/dane wspólników/,/czwarta linia/ {
    if (/imie/ || /nazwisko/ || /pesel/ || /funkcja/) {
        if (/imie/) {
            shareholdersArray[rowsCount, 1] = jsonStringEscapeAndWrap($2)
            matchCount++
        }
        if (/nazwisko/) {
            shareholdersArray[rowsCount, 2] = jsonStringEscapeAndWrap($2)
            matchCount++
        }
        if (/pesel/) {
            shareholdersArray[rowsCount, 3] = $2
            matchCount++
        }
        if (/funkcja/) {
            shareholdersArray[rowsCount, 4] = jsonStringEscapeAndWrap($2)
            matchCount++
        }
        if (matchCount == 4) {
            rowsCount++
            matchCount = 0
        }
    }
}
END {
    shareHolders = jsonValidateAndPrint(shareholdersArray, rowsCount - 1, shareholdersJsonSchema, 4, errorArray)
    shareHoldersErrors = jsonPrint(errorArray, length(errorArray) / length(errorsSchema), errorsSchema)
    resultArray[1, 1] = "\n[\n" shareHolders "\n]\n"
    resultArray[1, 2] = "\n[\n" shareHoldersErrors "\n]\n"
    resultJson = jsonPrint(resultArray, 1, resultSchema)
    print resultJson
}
Produces output:
{"ShareHolders":
[
{"Imie":"JanA","Nazwisko":"NowakA","PESEL":null,"Funkcja":"PrezesA"},
{"Imie":"Ja\"nD","Nazwisko":"NowakD","PESEL":11111111111,"Funkcja":"PrezesD"},
{"Imie":"JanC","Nazwisko":"NowakC","PESEL":null,"Funkcja":"PrezesC"}
]
,"Errors":
[
{"PropertyName":"PESEL","Message":"Value 11111111111A not match format: ^[0-9]{11}$ ","PositionIndex":1},
{"PropertyName":"PESEL","Message":"Value 12342C not match format: ^[0-9]{11}$ ","PositionIndex":3}
]
}
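Assuming the script is saved as, say, validate.awk (an arbitrary name), it can be run against the input file with:

$ gawk -f validate.awk input.txt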

How to add CHAR(") to a string using C or Python?

I have a JSON-like (but not valid JSON) string saved in a file:
{"content":["info":{"tid":(uint)123,"pid":(int)456}],"header":{"test":"hello"}}
To compare two of these strings with Python's json module, I need to reformat them first, because they decode incorrectly as-is; decoding would work if (uint)123 were written as "(uint)123". I have written the C code below:
void dofile(char *filename)
{
    FILE *f;
    long len;
    char *data;
    char *head;
    char *ptr;
    char *value;

    /* read the whole file into memory */
    f = fopen(filename, "rb");
    fseek(f, 0, SEEK_END);
    len = ftell(f);
    fseek(f, 0, SEEK_SET);
    data = (char*)malloc(len + 1);
    fread(data, 1, len, f);
    data[len] = '\0';
    fclose(f);

    head = data;
    ptr = data;
    while (*ptr)
    {
        if (*ptr++ == '\\')
            continue;
        int l = 1;
        /* look for the sequence ":( that introduces a cast value, e.g. ":(uint)123 */
        if (*ptr == '\"' && *++ptr == ':' && *++ptr == '(')
        {
            value = ptr;
            /* skip past the closing parenthesis of the cast */
            while (*ptr++ != ')' && *ptr && ++l)
                ;
            ptr++;
            len++;
            if (*ptr == '-')
            {
                ptr++;
                l++;
            }
            /* consume the digits of the numeric value */
            while (*ptr >= '0' && *ptr <= '9' && *ptr && ++l)
            {
                ptr++;
            }
            /* rebuild the buffer with quotes inserted around the value */
            char *tmp = (char*)malloc(len + 2);
            memcpy(tmp, head, value - head);
            memset(tmp + (value - head), '\"', 1);
            memcpy(tmp + (value - head) + 1, value, ptr - value);
            memset(tmp + (ptr - head) + 1, '\"', 1);
            memcpy(tmp + (ptr - head) + 2, ptr, len - (ptr - head));
            len += 2;
            ptr = tmp + (ptr - head) + 2;
            free(data);
            data = tmp;
            head = data;
        }
    }

    /* write the modified buffer back to the same file */
    f = fopen(filename, "wb");
    fwrite(data, len - 1, 1, f);
    fclose(f);
    printf("%s\n", data);
    free(data);
}
It works, but does it have problems? I'm concerned that I use so many memory functions.

Purity of Phobos reduce

Why isn't std.algorithm.reduce in Phobos pure? Is it an unfixed issue, or is there a reason why it can't be?
Does this have something to do with the question "What does a pure function look like?" that Andrei asked in the final lecture at DConf 2013?
See: http://forum.dlang.org/thread/20120306224101.GA30389@quickfur.ath.cx
I want the function sparseness in the following code to be pure. I guess I could always replace reduce with a foreach loop for now, right?
import std.algorithm: reduce, min, max;
import std.typetuple: templateAnd;
import std.traits: isArray, Unqual, isFloatingPoint;
import std.range: ElementType, isInputRange, isBidirectionalRange;

/** Returns: true if $(D x) is set to the default value of its type. */
bool defaulted(T)(T x) @safe pure nothrow { return x == T.init; }
alias defaulted untouched;

/** Returns: Number of Default-Initialized (Zero) Elements in $(D range). */
size_t sparseness(T)(in T x, int recurseDepth = -1) @trusted /* pure nothrow */ {
    import std.traits: isStaticArray;
    static if (isStaticArray!T ||
               isInputRange!T) {
        import std.range: empty;
        immutable isEmpty = x.empty;
        if (isEmpty || recurseDepth == 0) {
            return isEmpty;
        } else {
            const nextDepth = (recurseDepth == -1 ?
                               recurseDepth :
                               recurseDepth - 1);
            static if (isStaticArray!T) { // TODO: Why can't algorithms be applied to static arrays?
                typeof(return) ret;
                foreach (ref elt; x) { ret += elt.sparseness(nextDepth); }
                return ret;
            } else {
                import std.algorithm: map, reduce;
                return reduce!"a+b"(x.map!(a => a.sparseness(nextDepth)));
            }
        }
    } else static if (isFloatingPoint!T) {
        return x == 0; // explicit zero because T.init is NaN here
    } else {
        return x.defaulted;
    }
}
unittest {
    assert(1.sparseness == 0);
    assert(0.sparseness == 1);
    assert(0.0.sparseness == 1);
    assert(0.1.sparseness == 0);
    assert(0.0f.sparseness == 1);
    assert(0.1f.sparseness == 0);
    assert("".sparseness == 1);
    assert(null.sparseness == 1);
    immutable ubyte[3] x3 = [1, 2, 3]; assert(x3[].sparseness == 0);
    immutable float[3] f3 = [1, 2, 3]; assert(f3[].sparseness == 0);
    immutable ubyte[2][2] x22 = [0, 1, 0, 1]; assert(x22[].sparseness == 2);
    immutable ubyte[2][2] x22z = [0, 0, 0, 0]; assert(x22z[].sparseness == 4);
}
Update:
I decided to use isIterable and foreach instead of the above, as this works just as well for me right now and makes things @safe pure nothrow. I see no need right now to use higher-order functions to solve this problem. I also found David Simcha's upcoming std.rational very natural to use here:
import std.traits: isIterable, isFloatingPoint;
import rational: Rational;

/** Returns: Number of Default-Initialized (Zero) Elements in $(D x) at
    recursion depth $(D depth).
*/
Rational!ulong sparseness(T)(in T x, int depth = -1) @safe pure nothrow {
    alias R = typeof(return); // rational shorthand
    static if (isIterable!T) {
        import std.range: empty;
        immutable isEmpty = x.empty;
        if (isEmpty || depth == 0) {
            return R(isEmpty, 1);
        } else {
            immutable nextDepth = (depth == -1 ? depth : depth - 1);
            ulong nums, denoms;
            foreach (ref elt; x) {
                auto sub = elt.sparseness(nextDepth);
                nums += sub.numerator;
                denoms += sub.denominator;
            }
            return R(nums, denoms);
        }
    } else static if (isFloatingPoint!T) {
        return R(x == 0, 1); // explicit zero because T.init is NaN here
    } else {
        return R(x.defaulted, 1);
    }
}
If you change nextDepth to immutable rather than const, then sparseness will be pure.
I believe this is a bug. It may have to do with the closure passed to reduce capturing nextDepth and, for some reason, treating it as possibly mutable because it is const. Values declared as const are, however, identical to those declared as immutable -- the difference only manifests itself with indirections -- so I believe it is an error.
You may want to file a minimal repro case as a bug.
(It cannot be nothrow, however, because reduce can, in fact, throw.)

awk to translate config file to json

I have a config file like this one:
[sectionOne]
key1_1=value1_1
key1_n=value1_n
#this is a comment
[sectionTwo]
key2_1=value2_1
key2_n=value2_n
;this is a comment also
[SectionThree]
key3_1=value3_1
key3_n=value3_n
[SectionFor]
...
I need to translate this into JSON using minimal shell tools (no Perl, Python, or PHP; just sed and awk are available).
The desired output is:
[
{"sectionOne": { "key1_1": "value1_1","key1_n": "value1_n"} },
{"sectionTwo": { "key2_1": "value2_1","key2_n": "value2_n"} },
{"sectionThree": { "key3_1": "value3_1","key3_n": "value3_n"}}
....
]
I tried several approaches over several hours, with no success.
Thank you in advance.
There are some inconsistencies between your sample input and desired output, so it's hard to be sure, but this should be close and easy to tweak if it's not 100% what you want:
$ cat file
[sectionOne]
key1_1=value1_1
key1_n=value1_n
#this is a comment
[sectionTwo]
key2_1=value2_1
key2_n=value2_n
;this is a comment also
[SectionThree]
key3_1=value3_1
key3_n=value3_n
$
$ cat tst.awk
BEGIN {
    FS = "="
    print "["
}
/^([#;]|[[:space:]]*$)/ {
    next
}
gsub(/[][]/, "") {
    printf "%s{\"%s\": { ", rs, $0
    rs = "} },\n"
    fs = ""
    next
}
{
    printf "%s\"%s\": \"%s\"", fs, $1, $2
    fs = ","
}
END {
    print rs "]"
}
$
$ awk -f tst.awk file
[
{"sectionOne": { "key1_1": "value1_1","key1_n": "value1_n"} },
{"sectionTwo": { "key2_1": "value2_1","key2_n": "value2_n"} },
{"SectionThree": { "key3_1": "value3_1","key3_n": "value3_n"} },
]
awk 'BEGIN{ print "[" }
/^[#;]/{ next } # Ignore comments
/^\[/{ gsub( "[][]", "" ); printf "%s{\"%s\": { ", s ? "}},\n" : "", $0; n=0; s=1 }
/=/ { gsub( "=", "\":\"" ); printf "%c\"%s\" ", n ? "," : "", $0; n=1 }
END{ print "}}\n]" }
'
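To run this one, append the name of the config file after the closing quote, just as a file name is passed to awk -f in the first answer.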
Here's a solution in bash using awk:
#!/bin/bash
awk -F"=" 'BEGIN { in_section = 0; first_field = 0; printf "[" }
{
    last = length($1);
    if ((substr($1, 1, 1) == "[") && (substr($1, last, last) == "]")) {
        if (in_section == 1) {
            printf "} },";
        }
        section = substr($1, 2, last - 2);
        printf "\n{\"%s\":", section;
        printf " {";
        first_field = 1;
        in_section = 1;
    } else if (substr($1, 1, 1) == "#" || substr($1, 1, 1) == ";") {
        # comment line: ignore it
    } else if (($1 != "") && ($2 != "")) {
        if (first_field == 0) {
            printf ", ";
        }
        printf "\"%s\": \"%s\"", $1, $2;
        first_field = 0;
    }
}
END { printf "} }\n]\n" }'
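The embedded awk program reads standard input, so if the script is saved as, say, ini2json.sh (a name chosen only for illustration), it would be invoked as:

$ ./ini2json.sh < file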