Let's assume that we have a file with the values as seen bellow:
% head test.csv
20220601,A,B,1
20220530,A,B,1
And we want to add two new columns, one with the date minus 1 day and one with minus 7 days, resulting the following:
% head new_test.csv
20220601,A,B,20220525,20220531,1
20220530,A,B,20220523,20220529,1
The awk that was used to produce the above is:
% awk 'BEGIN{FS=OFS=","} { a="date -d \"$(date -d \""$1"\") -7 days\" +'%Y%m%d'"; a | getline st ; close(a) ;b="date -d \"$(date -d \""$1"\") -1 days\" +'%Y%m%d'"; b | getline cb ; close(b) ;print $1","$2","$3","st","cb","$4}' test.csv > new_test.csv
But after applying the above in a large file with more than 100K lines it runs for 20 minutes, is there any way to optimize the awk?
One GNU awk approach:
awk '
BEGIN { FS=OFS=","
secs_in_day = 60 * 60 * 24
}
{ dt = mktime( substr($1,1,4) " " substr($1,5,2) " " substr($1,7,2) " 12 0 0" )
dt1 = strftime("%Y%m%d",dt - secs_in_day )
dt7 = strftime("%Y%m%d",dt - (secs_in_day * 7) )
print $1,$2,$3,dt7,dt1,$4
}
' test.csv
This generates:
20220601,A,B,20220525,20220531,1
20220530,A,B,20220523,20220529,1
NOTES:
requires GNU awk for the mktime() and strftime() functions; see GNU awk time functions for more details
other flavors of awk may have similar functions, ymmv
You can try using function calls, it is faster than calling the .
awk -F, '
function cmd1(date){
a="date -d \"$(date -d \""date"\") -1days\" +'%Y%m%d'"
a | getline st
return st
close(a)
}
function cmd2(date){
b="date -d \"$(date -d \""date"\") -7days\" +'%Y%m%d'"
b | getline cm
return cm
close(b)
}
{
$5=cmd1($1)
$6=cmd2($1)
print $1","$2","$3","$5","$6","$4
}' OFS=, test > newFileTest
I executed this against a file with 20000 records in seconds, compared to the original awk which took around 5 minutes.
I have put togheter in an awk script, the average and standard deviation from 2 columns (weight,height) and the % of olympic medals achieved (sum and group by male/female).
I only lack the std calculation as of right now.
Sometimes either the weight or the height fields have empty values. I would like to skip them if one of them is empty, to not affect the calculations.
$6=height,$7=weight
Header should be:
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
Imput data to test it:
id,name,nationality,sex,date_of_birth,height,weight,sport,gold,silver,bronze,info
736041664,A Jesus Garcia,ESP,male,1969-10-17,,64,athletics,1,0,0,
435962603,Aaron Brown,USA,male,1992-05-27,1.98,79,athletics,0,1,2,
266237702,Aaron Russell,USA,male,1993-06-04,2.05,98,volleyball,0,0,1,
87689776,Aauri Lorena Bokesa,ESP,female,1988-12-14,1.80,62,athletics,0,1,0,
997877719,Ababel Yeshaneh,ETH,female,1991-07-22,1.65,54,athletics,1,0,0,
343694681,Abadi Hadis,ETH,male,1997-11-06,1.70,63,athletics,0,4,0,
376068084,Abbey D'Agostino,USA,female,1992-05-25,1.61,49,athletics,0,0,1,
162792594,Abbey Weitzeil,USA,female,1996-12-03,1.78,68,aquatics,1,1,0,
803161695,Abdelaziz Merzougui,ESP,male,1991-08-30,1.75,,athletics,1,0,1,
The script is :
BEGIN { FS="," }
NR>1 { medals_all+= ($9 + $10 + $11) # sum of ALL medals
if ($3 != country) next # if not the country of interest then go to next record
found_country=1
counts[$4]++ # count of athletes by sex
height_sum[$4]+= $6 # sum of heights by sex
weight_sum[$4]+= $7 # sum of weights by sex
medals_sum[$4]+= ($9 + $10 + $11) # sum of medals by sex
}
END { if (found_country != 1) {
printf "Sorry, country \"%s\" not found.\n", country
}
else {
print "Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals"
for (sex in counts)
printf "%s,%s,%.4f Kg,%s,%.3f m,%s,%.4f%\n",
country,sex,
(counts[sex]>0) ? (weight_sum[sex]/counts[sex]) : 0,"weight_std",
(counts[sex]>0) ? (height_sum[sex]/counts[sex]) : 0,"height_std",
(medals_all >0) ? (medals_sum[sex]/medals_all*100) : 0
}
}
I was thinking of something like:
if ($6 | $7 = "" ) next
But it gives me an error and I don't know where to put it (After END block or before?)
One awk idea (sans the code for std deviation)
$ cat athletes.awk
BEGIN { FS="," }
NR>1 { medals_all+= ($9 + $10 + $11) # sum of ALL medals
if ($3 != country) next # if not the country of interest then go to next record
found_country=1
counts[$4]++ # count of athletes by sex
height_sum[$4]+= $6 # sum of heights by sex
weight_sum[$4]+= $7 # sum of weights by sex
medals_sum[$4]+= ($9 + $10 + $11) # sum of medals by sex
}
END { if (found_country != 1) {
printf "Sorry, country \"%s\" not found.\n", country
}
else {
print "Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals"
for (sex in counts)
printf "%s,%s,%.4f Kg,%s,%.3f m,%s,%.4f%\n",
country,sex,
(counts[sex]>0) ? (weight_sum[sex]/counts[sex]) : 0,"weight_std",
(counts[sex]>0) ? (height_sum[sex]/counts[sex]) : 0,"height_std",
(medals_all >0) ? (medals_sum[sex]/medals_all*100) : 0
}
}
Testing the script:
$ awk -v country=USA -f athletes.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
USA,female,58.5000 Kg,weight_std,1.695 m,height_std,18.7500%
USA,male,88.5000 Kg,weight_std,2.015 m,height_std,25.0000%
$ awk -v country=ESP -f athletes.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ESP,female,62.0000 Kg,weight_std,1.800 m,height_std,6.2500%
ESP,male,65.5000 Kg,weight_std,1.735 m,height_std,18.7500%
$ awk -v country=ETH -f athletes.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ETH,male,63.0000 Kg,weight_std,1.700 m,height_std,25.0000%
ETH,female,54.0000 Kg,weight_std,1.650 m,height_std,6.2500%
$ awk -v country=XXX -f athletes.awk athletesv2.csv
Sorry, country "XXX" not found.
This doesn't attempt to do the std deviation calculation and idk how you're getting those medal percent numbers in your expected output but it should be easy for you to tweak this to finish off whatever it is you need to do:
$ cat tst.awk
BEGIN {
FS = OFS = ","
OFMT = "%.4f"
}
NR==1 {
for (i=1; i<=NF; i++) {
f[$i] = i
}
print "Country", "Sex", "Weight_avg", "Weight_std", "Height_avg", "Height_std", "% Medals"
next
}
$(f["nationality"]) == country_code {
sex = $(f["sex"])
ccWeight[sex] += $(f["weight"])
ccHeight[sex] += $(f["height"])
ccMedals[sex] += ( $(f["gold"]) + $(f["silver"]) + $(f["bronze"]) )
}
END {
for ( sex in ccWeight ) {
avgWeight = ccWeight[sex] / ccMedals[sex]
stdWeight = "foo"
avgHeight = ccWeight[sex] / ccMedals[sex]
stdHeight = "bar"
pctMedals = ( ccMedals[sex] / (NR - 1) ) * 100
print country_code, sex, avgWeight, stdWeight, avgHeight, stdHeight, pctMedals
}
}
$ awk -v country_code=USA -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
USA,female,39,foo,39,bar,33.3333
USA,male,44.2500,foo,44.2500,bar,44.4444
$ awk -v country_code=ESP -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ESP,female,62,foo,62,bar,11.1111
ESP,male,43.6667,foo,43.6667,bar,33.3333
$ awk -v country_code=ETH -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ETH,male,15.7500,foo,15.7500,bar,44.4444
ETH,female,54,foo,54,bar,11.1111
Here's another possible interpretation of how to calculate average medals given your comment below but it still doesn't produce the output you want so I guess you mean something different again:
$ cat tst.awk
BEGIN {
FS = OFS = ","
OFMT = "%.4f"
}
NR==1 {
for (i=1; i<=NF; i++) {
f[$i] = i
}
print "Country", "Sex", "Weight_avg", "Weight_std", "Height_avg", "Height_std", "% Medals"
next
}
{
sex = $(f["sex"])
numMedals = ( $(f["gold"]) + $(f["silver"]) + $(f["bronze"]) )
allMedals[sex] += numMedals
}
$(f["nationality"]) == country_code {
ccWeight[sex] += $(f["weight"])
ccHeight[sex] += $(f["height"])
ccMedals[sex] += numMedals
}
END {
for ( sex in ccWeight ) {
avgWeight = ccWeight[sex] / ccMedals[sex]
stdWeight = "foo"
avgHeight = ccWeight[sex] / ccMedals[sex]
stdHeight = "bar"
pctMedals = ( ccMedals[sex] / allMedals[sex] ) * 100
print country_code, sex, avgWeight, stdWeight, avgHeight, stdHeight, pctMedals
}
}
$ awk -v country_code=USA -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
USA,female,39,foo,39,bar,60
USA,male,44.2500,foo,44.2500,bar,36.3636
A single unified solution that no longer requires the user to manually enter nations one at a time :
the PROCINFO[ ] bit is for swapping from mawk to gawk
while retaining some semblance of sorted order
==
< test_athletesv2.csv\
\
| WHINY_USERS=1 mawk '
function sum(_,__,___) {
if(+__<-__) {
return sprintf(OFMT,$_)
}
___=""
for(_;_<=__;_+=3) {
___+=$_ }
return +___
}
function mean(_,__) {
return \
sprintf(OFMT,
(+__<-__ ? +$_ :2/__*sum(_,__))\
/(100.0^(_==1)))
}
function sd(_,__,___,____,_____) {
if(+__<-__) {
return "0.0000000"
}
____=""
_____=100^(_==1)
for(_;_<=+__;_+=3) {
____+=(($_)/_____-___)^2
}
return (____/(__/(_=2)))^_--^-_
}
function printreport(_,__,___) {
___=""
print substr(_,__~__,index(_,"=")-1),
substr(_, index(_,"=")+(_=1)),
___=mean(_,__),sd(_++,__,___),
___=mean(_,__),sd(_++,__,___),
sprintf("%8.4f-%%",sum(_,__)*100/_______)
}
BEGIN { _ = ""
PROCINFO[ "sorted_in" \
] = "#ind_str_asc";
___ = 3
______ = " Country,Gender,Weight_avg,Weight_std"\
",Height_avg,Height_std,%-Medals"
SUBSEP = "="
OFS = FS = ","
getline } { sub("$",sprintf("=%.f=%.f=%.f", \
int(100*$6),$7,-_\
+(_+=+$9+$10+$11)),
_____[____[$___]=$___,$4])
} END {
_______ = +_
___ = 3
FS = SUBSEP
CONVFMT = OFMT ="%13.7f"
for(_ in ____) {
printf("%s%s%s",ORS,______,ORS)
for(__ in _____) {
if(index(__,_)) {
$+FS=substr(_____[__],—-___)
printreport(__,(-!!___)^(NF==++___)*NF)
}
}
}
}' | column -s',' -t | column -t | lgp3 3
Country Gender Weight_avg Weight_std Height_avg Height_std %-Medals
ESP female 1.8000000 0.0000000 62.0000000 0.0000000 6.2500-%
ESP male 1.1566667 0.4723660 43.6666667 17.8688639 18.7500-%
Country Gender Weight_avg Weight_std Height_avg Height_std %-Medals
ETH female 1.6500000 0.0000000 54.0000000 0.0000000 6.2500-%
ETH male 1.7000000 0.0000000 63.0000000 0.0000000 25.0000-%
Country Gender Weight_avg Weight_std Height_avg Height_std %-Medals
USA female 1.1300000 0.4665119 39.0000000 17.7106371 18.7500-%
USA male 1.3400000 0.5476008 59.0000000 25.3048085 25.0000-%