Dynamically edit lists within a csv file - mysql

I have a csv file that looks like this:
col1|col2

1|a
2|g
3|f

1|m
3|k
2|n

2|a
1|d
4|r
3|s
where | separates the columns and a blank line separates one record from the next, and I would like to transform it into something homogeneous like:
------------------------
fields > 1 2 3 4
record1 a g f
record2 m n k
record3 d a s r
------------------------
Is there a way to do that? What would be better, using mysql or editing the csv file?

I wrote this; it works for your example. gawk is required (asorti() is a gawk extension):
awk -F'|' -v RS="" '{for(i=1;i<=NF;i+=2)a[$i]=$(i+1);asorti(a,d);
for(i=1;i<=length(a);i++)printf "%s", a[d[i]]((i==length(a))?"":" ");delete a;delete d;print ""}' file
example:
kent$ cat file
1|a
2|g
3|f

1|m
3|k
2|n

2|a
1|d
4|r
3|s
kent$ awk -F'|' -v RS="" '{for(i=1;i<=NF;i+=2)a[$i]=$(i+1);asorti(a,d);
for(i=1;i<=length(a);i++)printf "%s", a[d[i]]((i==length(a))?"":" ");delete a;delete d;print ""}' file
a g f
m n k
d a s r
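For readability, here is the same logic written out as a multi-line script (my formatting; still gawk, because of asorti()):
gawk -F'|' -v RS="" '{
    # paragraph mode: newline also splits fields, so a record is idx, val, idx, val, ...
    for (i = 1; i <= NF; i += 2)
        a[$i] = $(i+1)                  # map field index -> value
    n = asorti(a, d)                    # d[1..n] = indices in sorted order
    for (i = 1; i <= n; i++)
        printf "%s%s", a[d[i]], (i == n ? ORS : " ")
    delete a; delete d
}' file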

Here's an awk solution:
BEGIN{
    RS=""
    FS="\n"
}
FNR==NR && FNR>1 {
    for (i=1;i<=NF;i++) {
        split($i,d,"|")
        if (d[1] > max)
            max = d[1]
    }
    next
}
FNR>1 && !header {
    printf "%s\t","fields >"
    for (i=1;i<=max;i++)
        printf "%s\t",i
    print ""
    header=1
}
FNR>1 {
    printf "record%s\t\t",FNR-1
    for (i=1;i<=NF;i++) {
        split($i,d,"|")
        val[d[1]] = d[2]
    }
    for (i=1;i<=max;i++)
        printf "%s\t",val[i]?val[i]:"NULL"
    print ""
    delete val
}
Save it as script.awk and run it like this (notice it uses a two-pass approach, so you need to give the file twice):
$ awk -f script.awk file file
fields > 1 2 3 4
record1 a g f NULL
record2 m n k NULL
record3 d a s r
Adding the line 5|b to the first record in file gives the output:
$ awk -f script.awk file file
fields > 1 2 3 4 5
record1 a g f NULL b
record2 m n k NULL NULL
record3 d a s r NULL

$ cat file
col1|col2

1|a
2|g
3|f
5|b

1|m
3|k
2|n

2|a
1|d
4|r
3|s
$
$ awk -f tst.awk file
fields > 1 2 3 4 5
record1 a g f NULL b
record2 m n k NULL NULL
record3 d a s r NULL
$
$ cat tst.awk
BEGIN{ RS=""; FS="\n" }
NR>1 {
    ++numRecs
    for (i=1;i<=NF;i++) {
        split($i,fldNr2val,"|")
        fldNr = fldNr2val[1]
        val   = fldNr2val[2]
        recNrFldNr2val[numRecs,fldNr] = val
        numFlds = (fldNr > numFlds ? fldNr : numFlds)
    }
}
END {
    printf "fields >"
    for (fldNr=1; fldNr<=numFlds; fldNr++) {
        printf " %4s", fldNr
    }
    print ""
    for (recNr=1; recNr<=numRecs; recNr++) {
        printf "record%d ", recNr
        for (fldNr=1; fldNr<=numFlds; fldNr++) {
            printf " %4s", ((recNr,fldNr) in recNrFldNr2val ? recNrFldNr2val[recNr,fldNr] : "NULL")
        }
        print ""
    }
}

Subtract fixed number of days from date column using awk and add it to new column

Let's assume that we have a file with the values as seen below:
% head test.csv
20220601,A,B,1
20220530,A,B,1
And we want to add two new columns, one with the date minus 1 day and one with the date minus 7 days, resulting in the following:
% head new_test.csv
20220601,A,B,20220525,20220531,1
20220530,A,B,20220523,20220529,1
The awk that was used to produce the above is:
% awk 'BEGIN{FS=OFS=","} { a="date -d \"$(date -d \""$1"\") -7 days\" +'%Y%m%d'"; a | getline st ; close(a) ;b="date -d \"$(date -d \""$1"\") -1 days\" +'%Y%m%d'"; b | getline cb ; close(b) ;print $1","$2","$3","st","cb","$4}' test.csv > new_test.csv
But after applying the above to a large file with more than 100K lines it runs for 20 minutes. Is there any way to optimize the awk?
One GNU awk approach:
awk '
BEGIN { FS=OFS=","
        secs_in_day = 60 * 60 * 24
}
{ dt  = mktime( substr($1,1,4) " " substr($1,5,2) " " substr($1,7,2) " 12 0 0" )
  dt1 = strftime("%Y%m%d", dt - secs_in_day)
  dt7 = strftime("%Y%m%d", dt - (secs_in_day * 7))
  print $1,$2,$3,dt7,dt1,$4
}
' test.csv
This generates:
20220601,A,B,20220525,20220531,1
20220530,A,B,20220523,20220529,1
NOTES:
requires GNU awk for the mktime() and strftime() functions; see GNU awk time functions for more details
other flavors of awk may have similar functions, ymmv
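For example, the core epoch-seconds date math can be sanity-checked in isolation (a standalone snippet of mine, assuming a local timezone where noon is unambiguous):
$ gawk 'BEGIN {
      t = mktime("2022 06 01 12 0 0")       # noon, to stay clear of DST boundaries
      print strftime("%Y%m%d", t - 86400)   # one day earlier
  }'
20220531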
You can try moving the command calls into awk functions; in my testing it is faster than building and running the commands inline.
awk -F, '
function cmd1(date,    a, st){
    a = "date -d \"$(date -d \"" date "\") -1 days\" +%Y%m%d"
    a | getline st
    close(a)        # close before returning, or the pipe is never closed
    return st
}
function cmd2(date,    b, cm){
    b = "date -d \"$(date -d \"" date "\") -7 days\" +%Y%m%d"
    b | getline cm
    close(b)
    return cm
}
{
    $5 = cmd1($1)
    $6 = cmd2($1)
    print $1,$2,$3,$5,$6,$4
}' OFS=, test.csv > newFileTest
I executed this against a file with 20000 records and it completed in seconds, compared to the original awk which took around 5 minutes.
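If the same date value occurs on many input lines, you can go further and memoize the results so each distinct date spawns the external command only once. A sketch (my code, not the answer above; it assumes GNU date, which accepts the YYYYMMDD form directly):
awk -F',' -v OFS=',' '
function day_offset(date, days,    cmd, out) {
    if ((date, days) in cache)                # memoized? skip the external call
        return cache[date, days]
    cmd = "date -d \"" date " " days " days\" +%Y%m%d"
    cmd | getline out
    close(cmd)
    cache[date, days] = out
    return out
}
{ print $1, $2, $3, day_offset($1, -7), day_offset($1, -1), $4 }
' test.csv > new_test.csv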

GAWK script - Skip rows if $6 OR $7 = "" and STD calculation

I have put together an awk script that computes the average and standard deviation of 2 columns (weight, height) and the % of Olympic medals achieved (summed and grouped by male/female).
I only lack the std calculation right now.
Sometimes either the weight or the height field has an empty value. I would like to skip a row if one of them is empty, so it does not affect the calculations.
$6=height, $7=weight
Header should be:
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
Input data to test it:
id,name,nationality,sex,date_of_birth,height,weight,sport,gold,silver,bronze,info
736041664,A Jesus Garcia,ESP,male,1969-10-17,,64,athletics,1,0,0,
435962603,Aaron Brown,USA,male,1992-05-27,1.98,79,athletics,0,1,2,
266237702,Aaron Russell,USA,male,1993-06-04,2.05,98,volleyball,0,0,1,
87689776,Aauri Lorena Bokesa,ESP,female,1988-12-14,1.80,62,athletics,0,1,0,
997877719,Ababel Yeshaneh,ETH,female,1991-07-22,1.65,54,athletics,1,0,0,
343694681,Abadi Hadis,ETH,male,1997-11-06,1.70,63,athletics,0,4,0,
376068084,Abbey D'Agostino,USA,female,1992-05-25,1.61,49,athletics,0,0,1,
162792594,Abbey Weitzeil,USA,female,1996-12-03,1.78,68,aquatics,1,1,0,
803161695,Abdelaziz Merzougui,ESP,male,1991-08-30,1.75,,athletics,1,0,1,
The script is :
BEGIN { FS="," }
NR>1 { medals_all+= ($9 + $10 + $11) # sum of ALL medals
if ($3 != country) next # if not the country of interest then go to next record
found_country=1
counts[$4]++ # count of athletes by sex
height_sum[$4]+= $6 # sum of heights by sex
weight_sum[$4]+= $7 # sum of weights by sex
medals_sum[$4]+= ($9 + $10 + $11) # sum of medals by sex
}
END { if (found_country != 1) {
printf "Sorry, country \"%s\" not found.\n", country
}
else {
print "Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals"
for (sex in counts)
printf "%s,%s,%.4f Kg,%s,%.3f m,%s,%.4f%\n",
country,sex,
(counts[sex]>0) ? (weight_sum[sex]/counts[sex]) : 0,"weight_std",
(counts[sex]>0) ? (height_sum[sex]/counts[sex]) : 0,"height_std",
(medals_all >0) ? (medals_sum[sex]/medals_all*100) : 0
}
}
I was thinking of something like:
if ($6 | $7 = "" ) next
But it gives me an error, and I don't know where to put it (after the END block or before?).
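For reference: in awk, OR is || and comparison is == (a single | is a pipe into a command and a single = is an assignment, which is why that line errors out), and the test goes in the main body before the accumulating statements, not around the END block. A minimal sketch:
awk -F',' '
NR > 1 {
    if ($6 == "" || $7 == "") next   # skip rows missing height or weight
    print $4, $6, $7                 # ...accumulate sums/counts here instead
}' athletesv2.csv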
One awk idea (sans the code for std deviation):
$ cat athletes.awk
BEGIN { FS="," }
NR>1 { medals_all+= ($9 + $10 + $11) # sum of ALL medals
if ($3 != country) next # if not the country of interest then go to next record
found_country=1
counts[$4]++ # count of athletes by sex
height_sum[$4]+= $6 # sum of heights by sex
weight_sum[$4]+= $7 # sum of weights by sex
medals_sum[$4]+= ($9 + $10 + $11) # sum of medals by sex
}
END { if (found_country != 1) {
printf "Sorry, country \"%s\" not found.\n", country
}
else {
print "Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals"
for (sex in counts)
printf "%s,%s,%.4f Kg,%s,%.3f m,%s,%.4f%\n",
country,sex,
(counts[sex]>0) ? (weight_sum[sex]/counts[sex]) : 0,"weight_std",
(counts[sex]>0) ? (height_sum[sex]/counts[sex]) : 0,"height_std",
(medals_all >0) ? (medals_sum[sex]/medals_all*100) : 0
}
}
Testing the script:
$ awk -v country=USA -f athletes.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
USA,female,58.5000 Kg,weight_std,1.695 m,height_std,18.7500%
USA,male,88.5000 Kg,weight_std,2.015 m,height_std,25.0000%
$ awk -v country=ESP -f athletes.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ESP,female,62.0000 Kg,weight_std,1.800 m,height_std,6.2500%
ESP,male,65.5000 Kg,weight_std,1.735 m,height_std,18.7500%
$ awk -v country=ETH -f athletes.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ETH,male,63.0000 Kg,weight_std,1.700 m,height_std,25.0000%
ETH,female,54.0000 Kg,weight_std,1.650 m,height_std,6.2500%
$ awk -v country=XXX -f athletes.awk athletesv2.csv
Sorry, country "XXX" not found.
This doesn't attempt to do the std deviation calculation, and idk how you're getting those medal percentage numbers in your expected output, but it should be easy for you to tweak this to finish off whatever it is you need to do:
$ cat tst.awk
BEGIN {
FS = OFS = ","
OFMT = "%.4f"
}
NR==1 {
for (i=1; i<=NF; i++) {
f[$i] = i
}
print "Country", "Sex", "Weight_avg", "Weight_std", "Height_avg", "Height_std", "% Medals"
next
}
$(f["nationality"]) == country_code {
sex = $(f["sex"])
ccWeight[sex] += $(f["weight"])
ccHeight[sex] += $(f["height"])
ccMedals[sex] += ( $(f["gold"]) + $(f["silver"]) + $(f["bronze"]) )
}
END {
for ( sex in ccWeight ) {
avgWeight = ccWeight[sex] / ccMedals[sex]
stdWeight = "foo"
avgHeight = ccHeight[sex] / ccMedals[sex]
stdHeight = "bar"
pctMedals = ( ccMedals[sex] / (NR - 1) ) * 100
print country_code, sex, avgWeight, stdWeight, avgHeight, stdHeight, pctMedals
}
}
$ awk -v country_code=USA -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
USA,female,39,foo,1.1300,bar,33.3333
USA,male,44.2500,foo,1.0075,bar,44.4444
$ awk -v country_code=ESP -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ESP,female,62,foo,1.8000,bar,11.1111
ESP,male,43.6667,foo,1.1567,bar,33.3333
$ awk -v country_code=ETH -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ETH,male,15.7500,foo,0.4250,bar,44.4444
ETH,female,54,foo,1.6500,bar,11.1111
Here's another possible interpretation of how to calculate average medals given your comment below, but it still doesn't produce the output you want, so I guess you mean something different again:
$ cat tst.awk
BEGIN {
FS = OFS = ","
OFMT = "%.4f"
}
NR==1 {
for (i=1; i<=NF; i++) {
f[$i] = i
}
print "Country", "Sex", "Weight_avg", "Weight_std", "Height_avg", "Height_std", "% Medals"
next
}
{
sex = $(f["sex"])
numMedals = ( $(f["gold"]) + $(f["silver"]) + $(f["bronze"]) )
allMedals[sex] += numMedals
}
$(f["nationality"]) == country_code {
ccWeight[sex] += $(f["weight"])
ccHeight[sex] += $(f["height"])
ccMedals[sex] += numMedals
}
END {
for ( sex in ccWeight ) {
avgWeight = ccWeight[sex] / ccMedals[sex]
stdWeight = "foo"
avgHeight = ccHeight[sex] / ccMedals[sex]
stdHeight = "bar"
pctMedals = ( ccMedals[sex] / allMedals[sex] ) * 100
print country_code, sex, avgWeight, stdWeight, avgHeight, stdHeight, pctMedals
}
}
$ awk -v country_code=USA -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
USA,female,39,foo,1.1300,bar,60
USA,male,44.2500,foo,1.0075,bar,36.3636
A single unified solution that no longer requires the user to manually enter nations one at a time. The PROCINFO[ ] bit is for swapping from mawk to gawk while retaining some semblance of sorted order.
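(Aside, mine rather than the answer's: in gawk that feature looks like this; note the value string begins with "@".)
gawk 'BEGIN {
    PROCINFO["sorted_in"] = "@ind_str_asc"   # iterate for-in in sorted index order
    a["b"]; a["a"]; a["c"]
    for (k in a) printf "%s ", k             # prints: a b c
    print ""
}'
The solution itself: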
< test_athletesv2.csv\
\
| WHINY_USERS=1 mawk '
function sum(_,__,___) {
if(+__<-__) {
return sprintf(OFMT,$_)
}
___=""
for(_;_<=__;_+=3) {
___+=$_ }
return +___
}
function mean(_,__) {
return \
sprintf(OFMT,
(+__<-__ ? +$_ :2/__*sum(_,__))\
/(100.0^(_==1)))
}
function sd(_,__,___,____,_____) {
if(+__<-__) {
return "0.0000000"
}
____=""
_____=100^(_==1)
for(_;_<=+__;_+=3) {
____+=(($_)/_____-___)^2
}
return (____/(__/(_=2)))^_--^-_
}
function printreport(_,__,___) {
___=""
print substr(_,__~__,index(_,"=")-1),
substr(_, index(_,"=")+(_=1)),
___=mean(_,__),sd(_++,__,___),
___=mean(_,__),sd(_++,__,___),
sprintf("%8.4f-%%",sum(_,__)*100/_______)
}
BEGIN { _ = ""
PROCINFO[ "sorted_in" \
] = "#ind_str_asc";
___ = 3
______ = " Country,Gender,Weight_avg,Weight_std"\
",Height_avg,Height_std,%-Medals"
SUBSEP = "="
OFS = FS = ","
getline } { sub("$",sprintf("=%.f=%.f=%.f", \
int(100*$6),$7,-_\
+(_+=+$9+$10+$11)),
_____[____[$___]=$___,$4])
} END {
_______ = +_
___ = 3
FS = SUBSEP
CONVFMT = OFMT ="%13.7f"
for(_ in ____) {
printf("%s%s%s",ORS,______,ORS)
for(__ in _____) {
if(index(__,_)) {
$+FS=substr(_____[__],--___)
printreport(__,(-!!___)^(NF==++___)*NF)
}
}
}
}' | column -s',' -t | column -t | lgp3 3
Country Gender Weight_avg Weight_std Height_avg Height_std %-Medals
ESP female 1.8000000 0.0000000 62.0000000 0.0000000 6.2500-%
ESP male 1.1566667 0.4723660 43.6666667 17.8688639 18.7500-%
Country Gender Weight_avg Weight_std Height_avg Height_std %-Medals
ETH female 1.6500000 0.0000000 54.0000000 0.0000000 6.2500-%
ETH male 1.7000000 0.0000000 63.0000000 0.0000000 25.0000-%
Country Gender Weight_avg Weight_std Height_avg Height_std %-Medals
USA female 1.1300000 0.4665119 39.0000000 17.7106371 18.7500-%
USA male 1.3400000 0.5476008 59.0000000 25.3048085 25.0000-%
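For comparison, a readable plain-awk sketch of the same statistics (my code, not the answer above, whose output also appears to have the weight and height columns swapped). It uses the sum-of-squares identity var = E[x^2] - mean^2 for the population standard deviation, and skips rows with an empty height or weight entirely, which may or may not be how you want the medal tally handled:
awk -F',' -v country=USA '
NR > 1 {
    all_medals += $9 + $10 + $11
    if ($3 != country || $6 == "" || $7 == "") next
    n[$4]++
    h_sum[$4] += $6; h_sq[$4] += $6 * $6
    w_sum[$4] += $7; w_sq[$4] += $7 * $7
    medals[$4] += $9 + $10 + $11
}
END {
    print "Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals"
    for (sex in n) {
        wm = w_sum[sex] / n[sex]; hm = h_sum[sex] / n[sex]
        wv = w_sq[sex] / n[sex] - wm * wm    # population variance
        hv = h_sq[sex] / n[sex] - hm * hm
        printf "%s,%s,%.4f,%.4f,%.3f,%.4f,%.4f%%\n", country, sex,
            wm, sqrt(wv > 0 ? wv : 0), hm, sqrt(hv > 0 ? hv : 0),
            (all_medals ? medals[sex] / all_medals * 100 : 0)
    }
}' athletesv2.csv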

Complex CSV parsing with Linux commands

I have a CSV log file that records the properties HA;HB;HC;HD;HE. The following file records 6 entries, each introduced by a repetition of the header line.
I would like to extract the 3rd property (HC) of every entry.
HA;HB;HC;HD;HE
a1;b1;14;d;e
HA;HB;HC;HD;HE
a2;b2;28;d;e
HA;HB;HC;HD;HE
a31;b31;44;d;e
a32;b32;07;d;e
HA;HB;HC;HD;HE
a4;b4;0;d;e
HA;HB;HC;HD;HE
a51;b51;32;d;e
a52;b52;0;d;e
a53;b53;5;d;e
HA;HB;HC;HD;HE
a6;b6;10;d;e
Whenever there are n lines recorded in an entry, I want to extract the sum of the n HC values.
The expected output for the above file:
14
28
51
0
37
10
I know I can write a program for this, but is there an easy way to get this with a combination of awk and/or sed commands?
I haven't tested this; try it and let me know if it works.
awk -F';' '
$3 == "HC" {
    if (NR > 1) {
        print sum
        sum = 0
    }
    next
}
{ sum += $3 }
END { print sum }'
awk solution:
$ awk -F';' '$3=="HC" && p{
    print sum        # print current total
    sum=p=0          # reinitialize sum and p
    next
}
$3!="HC"{
    sum=sum+($3+0)   # make sure $3 is converted to integer; sum it up
    p=1              # set p to 1
}
END{print sum}       # print the last sum
' input.txt
output:
14
28
51
0
37
10
one-liner:
$ awk -F";" '$3=="HC" && p{print sum;sum=p=0;next} $3!="HC"{sum=sum+($3+0);p=1} END{print sum}' input.txt
awk -F';' '/^H.*/{if(f)print s;s=0;f=$3=="HC"}f{s+=$3}END{if(f)print s}' infile
For the given input:
$ cat infile
HA;HB;HC;HD;HE
a1;b1;14;d;e
HA;HB;HC;HD;HE
a2;b2;28;d;e
HA;HB;HC;HD;HE
a31;b31;44;d;e
a32;b32;07;d;e
HA;HB;HC;HD;HE
a4;b4;0;d;e
HA;HB;HC;HD;HE
a51;b51;32;d;e
a52;b52;0;d;e
a53;b53;5;d;e
HA;HB;HC;HD;HE
a6;b6;10;d;e
$ awk -F';' '/^H.*/{if(f)print s; s=0; f=$3=="HC"}f{s+=$3}END{if(f)print s}' infile
14
28
51
0
37
10
It takes a little more care; for example:
$ cat infile2
HA;HB;HC;HD;HE
a1;b1;14;d;e
HA;HB;HC;HD;HE
a2;b2;28;d;e
HA;HB;HC;HD;HE
a31;b31;44;d;e
a32;b32;07;d;e
HA;HB;HC;HD;HE
a4;b4;0;d;e
HA;HB;HD;HD;HE <---- say HC is not found in this header
a51;b51;32;d;e
a52;b52;0;d;e
a53;b53;5;d;e
HA;HB;HC;HD;HE
a6;b6;10;d;e
# find only HC in 3rd column
$ awk -F';' '/^H.*/{if(f)print s; s=0; f=$3=="HC"}f{s+=$3}END{if(f)print s}' infile2
14
28
51
0
10
# Find HD in 3rd column
$ awk -F';' '/^H.*/{if(f)print s; s=0; f=$3=="HD"}f{s+=$3}END{if(f)print s}' infile2
37
eval "true || $(cat data.csv|cut -d ";" -f3 |sed -e s/"HC"/"0; expr 0"/g |tr '\n' '#'|sed -e s/"##"/""/g|sed -e s/"#"/" + "/g)"
Explanation:
Get the contents of the file using cat
Take only the third column using cut with the delimiter ;
Replace HC lines with 0; expr 0 values to start building eval-worthy bash expressions that eventually yield expr 0 + 14
Replace \n newlines temporarily with # to circumvent possible BSD sed limitations
Replace double ## with a single # to avoid blank lines turning into spaces and causing expr to bomb out
Replace # with + to add the numbers together
Execute the command, prefixed with true || so the leading 0 doesn't cause a guaranteed syntax error on the first line
Which creates this:
true || 0; expr 0 + 14 + 0; expr 0 + 28 + 0; expr 0 + 44 + 07 + 0; expr 0 + 0 + 0; expr 0 + 32 + 0 + 5 + 0; expr 0 + 10
The output looks like this:
14
28
51
0
37
10
This was tested on Bash 3.2 and MacOS El Capitan.
Could you please try the following and let me know if it helps you.
awk -F";" '
/^H/ && $3!="HC"{
flag="";
next
}
/^H/ && $3=="HC"{
if(NR>1){
printf("%d\n",sum)
};
sum=0;
flag=1;
next
}
flag{
sum+=$3
}
END{
printf("%d\n",sum)
}
' Input_file
Output will be as follows.
14
28
51
0
37
10
$ awk -F';' '$3=="HC"{if (NR>1) print s; s=0; next} {s+=$3} END{print s}' file
14
28
51
0
37
10

removing duplicates from a file from the starting point of the file

input file
a
b
c
d
a
b
c
f
e
I tried the options below:
awk '!x[$0]++' file.txt > file.txt.tmp && mv file.txt.tmp file.txt
perl -ne 'print unless $dup{$_}++;' file.txt > file.txt.tmp && mv file.txt.tmp file.txt
awk '{if (++dup[$0] == 1) print $0;}' file.txt > file.txt.tmp && mv file.txt.tmp file.txt
But these remove duplicates keeping the first occurrence, giving output like below:
a
b
c
d
f
e
But we need the output to keep the last occurrence of each line instead, something like below.
Output file
d
a
b
c
f
e
I got the answer as below.
awk -F'|' '{k=$1 FS $2} NR==FNR {A[k]=NR; next} A[k]==FNR' file.txt file.txt
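The idea behind the two passes: on the first read (NR==FNR) it records, for each distinct line, the line number of its last occurrence; on the second read it prints a line only when the current position matches that record. Since this input has a single column, the field splitting isn't essential, and the same approach reduces to this sketch:
awk 'NR==FNR { last[$0] = FNR; next } FNR == last[$0]' file.txt file.txt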

Using AWK to get text and looping in csv

I'm new to awk and I want to ask...
I have a csv file like this:
IVALSTART IVALEND IVALDATE
23:00:00 23:30:00 4/9/2012

STATUS LSN LOC
K lskpg 1201
K lntrjkt 1201
K lbkkstp 1211
and I want to change it to this:
IVALSTART IVALEND
23:00:00 23:30:00

STATUS LSN LOC IVALDATE
K lskpg 1201 4/9/2012
K lntrjkt 1201 4/9/2012
K lbkkstp 1211 4/9/2012
How can I do this in awk?
Thanks and best regards!
Try this:
awk '
NR == 1 { name = $3; print $1, $2 }
NR == 2 { date = $3; print $1, $2 }
NR == 3 { print "" }
NR == 4 { $4 = name; print }
NR > 4 { $4 = date; print }
' FILE
If you need formatting, change print to printf with appropriate format specifiers.
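For example, to get fixed-width columns you could replace the last two rules with something like this (the widths are arbitrary, chosen for illustration):
NR == 4 { $4 = name; printf "%-10s %-9s %-5s %s\n", $1, $2, $3, $4 }
NR > 4  { $4 = date; printf "%-10s %-9s %-5s %s\n", $1, $2, $3, $4 }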