I have put together an awk script that computes the average and standard deviation of 2 columns (weight, height) and the % of olympic medals achieved (summed and grouped by male/female).
I only lack the std calculation as of right now.
Sometimes either the weight or the height fields have empty values. I would like to skip them if one of them is empty, to not affect the calculations.
$6=height,$7=weight
Header should be:
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
Input data to test it:
id,name,nationality,sex,date_of_birth,height,weight,sport,gold,silver,bronze,info
736041664,A Jesus Garcia,ESP,male,1969-10-17,,64,athletics,1,0,0,
435962603,Aaron Brown,USA,male,1992-05-27,1.98,79,athletics,0,1,2,
266237702,Aaron Russell,USA,male,1993-06-04,2.05,98,volleyball,0,0,1,
87689776,Aauri Lorena Bokesa,ESP,female,1988-12-14,1.80,62,athletics,0,1,0,
997877719,Ababel Yeshaneh,ETH,female,1991-07-22,1.65,54,athletics,1,0,0,
343694681,Abadi Hadis,ETH,male,1997-11-06,1.70,63,athletics,0,4,0,
376068084,Abbey D'Agostino,USA,female,1992-05-25,1.61,49,athletics,0,0,1,
162792594,Abbey Weitzeil,USA,female,1996-12-03,1.78,68,aquatics,1,1,0,
803161695,Abdelaziz Merzougui,ESP,male,1991-08-30,1.75,,athletics,1,0,1,
The script is :
BEGIN { FS = "," }
NR > 1 {
    medals_all += $9 + $10 + $11          # medals won by ALL athletes (denominator for %)
    if ($3 != country) next               # keep only the country of interest
    found_country = 1
    medals_sum[$4] += $9 + $10 + $11      # medals by sex (counted even for incomplete rows)
    if ($6 == "" || $7 == "") next        # skip rows missing height or weight: they would
                                          # otherwise enter the sums as 0 and skew avg/std
    counts[$4]++                          # athletes by sex with complete measurements
    height_sum[$4] += $6
    height_sq[$4]  += $6 * $6             # sum of squares, for the standard deviation
    weight_sum[$4] += $7
    weight_sq[$4]  += $7 * $7
}
END {
    if (found_country != 1) {
        printf "Sorry, country \"%s\" not found.\n", country
        exit
    }
    print "Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals"
    for (sex in counts) {
        n  = counts[sex]
        wm = weight_sum[sex] / n          # mean weight
        hm = height_sum[sex] / n          # mean height
        wv = weight_sq[sex] / n - wm * wm # population variance: E[x^2] - (E[x])^2
        hv = height_sq[sex] / n - hm * hm
        if (wv < 0) wv = 0                # clamp tiny negatives caused by round-off
        if (hv < 0) hv = 0
        # NOTE: a literal percent sign must be written %% — the original "%\n" is undefined
        printf "%s,%s,%.4f Kg,%.4f,%.3f m,%.4f,%.4f%%\n", \
            country, sex, wm, sqrt(wv), hm, sqrt(hv), \
            (medals_all > 0 ? medals_sum[sex] / medals_all * 100 : 0)
    }
}
I was thinking of something like:
if ($6 | $7 = "" ) next
But it gives me an error and I don't know where to put it (After END block or before?)
One awk idea (sans the code for std deviation)
$ cat athletes.awk
# Per-country summary: averages of weight/height and % of all medals, by sex.
# Invoke as: awk -v country=USA -f athletes.awk athletesv2.csv
# NOTE(review): empty height/weight fields are added as 0 and still increment
#   counts[], which skews the averages; skip such rows if that is unwanted.
# NOTE(review): the printf format ends with a lone "%" — a literal percent sign
#   must be written as %% (a trailing lone % is undefined behavior in printf).
BEGIN { FS="," }
NR>1 { medals_all+= ($9 + $10 + $11) # sum of ALL medals
if ($3 != country) next # if not the country of interest then go to next record
found_country=1
counts[$4]++ # count of athletes by sex
height_sum[$4]+= $6 # sum of heights by sex
weight_sum[$4]+= $7 # sum of weights by sex
medals_sum[$4]+= ($9 + $10 + $11) # sum of medals by sex
}
# END: report either a not-found message or one CSV row per sex seen.
END { if (found_country != 1) {
printf "Sorry, country \"%s\" not found.\n", country
}
else {
print "Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals"
for (sex in counts)
printf "%s,%s,%.4f Kg,%s,%.3f m,%s,%.4f%\n",
country,sex,
(counts[sex]>0) ? (weight_sum[sex]/counts[sex]) : 0,"weight_std",
(counts[sex]>0) ? (height_sum[sex]/counts[sex]) : 0,"height_std",
(medals_all >0) ? (medals_sum[sex]/medals_all*100) : 0
}
}
Testing the script:
$ awk -v country=USA -f athletes.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
USA,female,58.5000 Kg,weight_std,1.695 m,height_std,18.7500%
USA,male,88.5000 Kg,weight_std,2.015 m,height_std,25.0000%
$ awk -v country=ESP -f athletes.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ESP,female,62.0000 Kg,weight_std,1.800 m,height_std,6.2500%
ESP,male,65.5000 Kg,weight_std,1.735 m,height_std,18.7500%
$ awk -v country=ETH -f athletes.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ETH,male,63.0000 Kg,weight_std,1.700 m,height_std,25.0000%
ETH,female,54.0000 Kg,weight_std,1.650 m,height_std,6.2500%
$ awk -v country=XXX -f athletes.awk athletesv2.csv
Sorry, country "XXX" not found.
This doesn't attempt to do the std deviation calculation and idk how you're getting those medal percent numbers in your expected output but it should be easy for you to tweak this to finish off whatever it is you need to do:
$ cat tst.awk
# Per-country summary keyed by header names rather than fixed column numbers.
# Invoke as: awk -v country_code=USA -f tst.awk athletesv2.csv
BEGIN {
    FS = OFS = ","
    OFMT = "%.4f"
}
NR==1 {
    # map header names to column numbers so the script survives column reordering
    for (i=1; i<=NF; i++) {
        f[$i] = i
    }
    print "Country", "Sex", "Weight_avg", "Weight_std", "Height_avg", "Height_std", "% Medals"
    next
}
$(f["nationality"]) == country_code {
    sex = $(f["sex"])
    ccCount[sex]++                       # athletes per sex — denominator for the averages
    ccWeight[sex] += $(f["weight"])
    ccHeight[sex] += $(f["height"])
    ccMedals[sex] += ( $(f["gold"]) + $(f["silver"]) + $(f["bronze"]) )
}
END {
    for ( sex in ccWeight ) {
        avgWeight = ccWeight[sex] / ccCount[sex]   # was /ccMedals[sex]: averaged per medal, not per athlete
        stdWeight = "foo"                          # placeholder — std dev intentionally left out
        avgHeight = ccHeight[sex] / ccCount[sex]   # was ccWeight[sex]: copy/paste bug summed weights
        stdHeight = "bar"
        pctMedals = ( ccMedals[sex] / (NR - 1) ) * 100   # medals per data row — semantics debatable, kept as-is
        print country_code, sex, avgWeight, stdWeight, avgHeight, stdHeight, pctMedals
    }
}
$ awk -v country_code=USA -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
USA,female,39,foo,39,bar,33.3333
USA,male,44.2500,foo,44.2500,bar,44.4444
$ awk -v country_code=ESP -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ESP,female,62,foo,62,bar,11.1111
ESP,male,43.6667,foo,43.6667,bar,33.3333
$ awk -v country_code=ETH -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ETH,male,15.7500,foo,15.7500,bar,44.4444
ETH,female,54,foo,54,bar,11.1111
Here's another possible interpretation of how to calculate average medals given your comment below but it still doesn't produce the output you want so I guess you mean something different again:
$ cat tst.awk
BEGIN {
FS = OFS = ","
OFMT = "%.4f"
}
NR==1 {
for (i=1; i<=NF; i++) {
f[$i] = i
}
print "Country", "Sex", "Weight_avg", "Weight_std", "Height_avg", "Height_std", "% Medals"
next
}
{
sex = $(f["sex"])
numMedals = ( $(f["gold"]) + $(f["silver"]) + $(f["bronze"]) )
allMedals[sex] += numMedals
}
$(f["nationality"]) == country_code {
ccWeight[sex] += $(f["weight"])
ccHeight[sex] += $(f["height"])
ccMedals[sex] += numMedals
}
END {
for ( sex in ccWeight ) {
avgWeight = ccWeight[sex] / ccMedals[sex]
stdWeight = "foo"
avgHeight = ccWeight[sex] / ccMedals[sex]
stdHeight = "bar"
pctMedals = ( ccMedals[sex] / allMedals[sex] ) * 100
print country_code, sex, avgWeight, stdWeight, avgHeight, stdHeight, pctMedals
}
}
$ awk -v country_code=USA -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
USA,female,39,foo,39,bar,60
USA,male,44.2500,foo,44.2500,bar,36.3636
A single unified solution that no longer requires the user to manually enter nations one at a time :
the PROCINFO[ ] bit is for swapping from mawk to gawk
while retaining some semblance of sorted order
==
# NOTE(review): deliberately obfuscated mawk program (all variables are runs of
#   underscores); left byte-for-byte as posted — a restyle could not be verified.
# NOTE(review): the "substr(_____[__],—-___)" call below contains an em-dash,
#   almost certainly a paste corruption of "--___" — confirm before running.
# NOTE(review): "lgp3" at the end of the pipeline is not a standard utility —
#   TODO confirm it exists on the target system.
# NOTE(review): WHINY_USERS=1 reportedly makes mawk iterate arrays in sorted
#   order (an undocumented switch); PROCINFO["sorted_in"] is the gawk equivalent.
< test_athletesv2.csv\
\
| WHINY_USERS=1 mawk '
function sum(_,__,___) {
if(+__<-__) {
return sprintf(OFMT,$_)
}
___=""
for(_;_<=__;_+=3) {
___+=$_ }
return +___
}
function mean(_,__) {
return \
sprintf(OFMT,
(+__<-__ ? +$_ :2/__*sum(_,__))\
/(100.0^(_==1)))
}
function sd(_,__,___,____,_____) {
if(+__<-__) {
return "0.0000000"
}
____=""
_____=100^(_==1)
for(_;_<=+__;_+=3) {
____+=(($_)/_____-___)^2
}
return (____/(__/(_=2)))^_--^-_
}
function printreport(_,__,___) {
___=""
print substr(_,__~__,index(_,"=")-1),
substr(_, index(_,"=")+(_=1)),
___=mean(_,__),sd(_++,__,___),
___=mean(_,__),sd(_++,__,___),
sprintf("%8.4f-%%",sum(_,__)*100/_______)
}
BEGIN { _ = ""
PROCINFO[ "sorted_in" \
] = "#ind_str_asc";
___ = 3
______ = " Country,Gender,Weight_avg,Weight_std"\
",Height_avg,Height_std,%-Medals"
SUBSEP = "="
OFS = FS = ","
getline } { sub("$",sprintf("=%.f=%.f=%.f", \
int(100*$6),$7,-_\
+(_+=+$9+$10+$11)),
_____[____[$___]=$___,$4])
} END {
_______ = +_
___ = 3
FS = SUBSEP
CONVFMT = OFMT ="%13.7f"
for(_ in ____) {
printf("%s%s%s",ORS,______,ORS)
for(__ in _____) {
if(index(__,_)) {
$+FS=substr(_____[__],—-___)
printreport(__,(-!!___)^(NF==++___)*NF)
}
}
}
}' | column -s',' -t | column -t | lgp3 3
Country Gender Weight_avg Weight_std Height_avg Height_std %-Medals
ESP female 1.8000000 0.0000000 62.0000000 0.0000000 6.2500-%
ESP male 1.1566667 0.4723660 43.6666667 17.8688639 18.7500-%
Country Gender Weight_avg Weight_std Height_avg Height_std %-Medals
ETH female 1.6500000 0.0000000 54.0000000 0.0000000 6.2500-%
ETH male 1.7000000 0.0000000 63.0000000 0.0000000 25.0000-%
Country Gender Weight_avg Weight_std Height_avg Height_std %-Medals
USA female 1.1300000 0.4665119 39.0000000 17.7106371 18.7500-%
USA male 1.3400000 0.5476008 59.0000000 25.3048085 25.0000-%
Related
Let's assume that we have a file with the values as seen bellow:
% head test.csv
20220601,A,B,1
20220530,A,B,1
And we want to add two new columns, one with the date minus 1 day and one with minus 7 days, resulting the following:
% head new_test.csv
20220601,A,B,20220525,20220531,1
20220530,A,B,20220523,20220529,1
The awk that was used to produce the above is:
% awk 'BEGIN{FS=OFS=","} { a="date -d \"$(date -d \""$1"\") -7 days\" +'%Y%m%d'"; a | getline st ; close(a) ;b="date -d \"$(date -d \""$1"\") -1 days\" +'%Y%m%d'"; b | getline cb ; close(b) ;print $1","$2","$3","st","cb","$4}' test.csv > new_test.csv
But after applying the above in a large file with more than 100K lines it runs for 20 minutes, is there any way to optimize the awk?
One GNU awk approach:
awk '
# Append date-minus-7-days and date-minus-1-day columns using gawk time functions
# instead of spawning the external date command per line.
BEGIN {
    FS = OFS = ","
    day = 86400                       # 60 * 60 * 24 seconds
}
{
    # anchor at 12:00 — presumably to sidestep DST transitions; confirm if exact
    # midnight semantics are required
    noon = mktime( substr($1,1,4) " " substr($1,5,2) " " substr($1,7,2) " 12 0 0" )
    print $1, $2, $3, strftime("%Y%m%d", noon - 7 * day), strftime("%Y%m%d", noon - day), $4
}
' test.csv
This generates:
20220601,A,B,20220525,20220531,1
20220530,A,B,20220523,20220529,1
NOTES:
requires GNU awk for the mktime() and strftime() functions; see GNU awk time functions for more details
other flavors of awk may have similar functions, ymmv
You can try using function calls; it is faster than building and calling the command inline for every line.
awk -F, '
# Wrap each external date invocation in a function. close() must run BEFORE
# return — in the original it sat after return and was unreachable, so each
# command pipe stayed open (file-descriptor leak and stale getline results).
# Extra parameters a/st (b/cm) make the variables local to the function.
function cmd1(date,    a, st){
    a="date -d \"$(date -d \""date"\") -1days\" +'%Y%m%d'"
    a | getline st
    close(a)
    return st
}
function cmd2(date,    b, cm){
    b="date -d \"$(date -d \""date"\") -7days\" +'%Y%m%d'"
    b | getline cm
    close(b)
    return cm
}
{
    $5=cmd1($1)
    $6=cmd2($1)
    print $1","$2","$3","$5","$6","$4
}' OFS=, test > newFileTest
I executed this against a file with 20000 records in seconds, compared to the original awk which took around 5 minutes.
With the aim to perform some statistical analysis of multi-column data I am analyzing big number of CSV filles using the following bash + AWK routine:
#!/bin/bash
# For every directory prefix (e.g. 10V1 from 10V1_cne_lig1), average column 3
# of each *_filt.csv and write one "<suffix>: <mean>" line per file to
# $rescore/<prefix>.csv.
home="$PWD"
# folder with the outputs
rescore="${home}"/rescore
# folder with the folders to analyse
storage="${home}"/results
#cd "${home}"/results
# quoted + guarded: a failed cd must not let the loop glob in the wrong directory
cd "${storage}" || exit 1
csv_pattern='*_filt.csv'
# each $d is one unique prefix produced by the find|awk pipeline below
while read -r d; do
awk -v rescore="$rescore" '
FNR==1 {
if (n)
mean[suffix] = s/n
prefix=suffix=FILENAME
sub(/_.*/, "", prefix)
sub(/\/[^\/]+$/, "", suffix)
sub(/^.*_/, "", suffix)
s=n=0
}
FNR > 1 {
s += $3
++n
}
END {
out = rescore "/" prefix ".csv"
# guard: the last file may have no data rows — avoid division by zero
if (n)
mean[suffix] = s/n
print prefix ":", "dG(mean)" > out
for (i in mean)
printf "%s: %.2f\n", i, mean[i] >> out
close(out)
}' "${d}_"*/${csv_pattern} #> "${rescore}/"${d%%_*}".csv"
done < <(find . -maxdepth 1 -type d -name '*_*_*' | awk -F '[_/]' '!seen[$2]++ {print $2}')
Basically the script takes ensemble of CSV files belonged to the same prefix (defined as the naming pattern occured at the begining of the directory contained CSV, for example 10V1 from 10V1_cne_lig1) and calculate for it the mean value for the numbers in the third column:
# input *_filt.csv located in the folder 10V1_cne_lig1001
ID, POP, dG
1, 142, -5.6500
2, 10, -5.5000
3, 2, -4.9500
add 1 string to 10V1.csv, which is organized in 2 column format i) the name of the suffix of the folder with initial CSV; ii) the mean value calculated for all numbers in the third column (dG) of input.csv:
# this is two column format of output.csv: 10V1.csv
10V1: dG(mean)
lig1001: -5.37
in this way for 100 CSV filles such output.csv should contain 100 lines with the mean values, etc
I need to introduce a small modification to my AWK part of my routine that would add the 3rd column to the output CSV with RMSD value (as the measure of the differences between initial dG values) of the initial data (dG), which had been used to calculate the MEAN value. Using AWK syntax, with a particular MEAN value the RMS could be expressed as
mean=$(awk -F , '{sum+=$3}END{printf "%.2f", sum/NR}' $csv)
rmsd=$(awk -v mean=$mean '{++n;sum+=($NF-mean)^2} END{if(n) printf "%.2f", sqrt(sum/n)}' $csv)
Here is expected output for 5 means and 5 rmsds values calculated for 5 CSV logs (the first one is corresponded to my above example!):
10V1: dG(mean): RMSD (error)
lig1001 -5.37 0.30
lig1002 -8.53 0.34
lig1003 -6.57 0.25
lig1004 -9.53 0.00 # rmsd=0 since initial csv has only 1 line: no data variance
lig1005 -8.11 0.39
How this addition could be incorporated into my main bash-AWK code with the aim to add the third RMSD column (for each of the processed CSV, thus taking each of the calculated MEAN) to the output.csv?
You can calculate both of mean and rmsd within the awk code.
Would you please try the following awk code:
awk -v rescore="$rescore" '
FNR==1 {
    if (n) {                   # calculate the results of previous file
        m = s / n              # mean
        var = s2 / n - m * m   # variance
        if (var < 0) var = 0   # avoid an exception due to round-off error
        mean[suffix] = m       # store the mean in an array
        rmsd[suffix] = sqrt(var)
    }
    prefix=suffix=FILENAME
    sub(/_.*/, "", prefix)
    sub(/\/[^\/]+$/, "", suffix)
    sub(/^.*_/, "", suffix)
    s = 0                      # sum of $3
    s2 = 0                     # sum of $3 ** 2
    n = 0                      # count of samples
}
FNR > 1 {
    s += $3
    s2 += $3 * $3
    ++n
}
END {
    out = rescore "/" prefix ".csv"
    if (n) {                   # guard: no data rows seen -> avoid division by zero
        m = s / n
        var = s2 / n - m * m
        if (var < 0) var = 0
        mean[suffix] = m
        rmsd[suffix] = sqrt(var)
    }
    print prefix ":", "dG(mean)", "dG(rmsd)" > out
    for (i in mean)
        printf "%s: %.2f %.2f\n", i, mean[i], rmsd[i] >> out
    close(out)
}'
Here is the version to print the lowest value of dG.
# Per-file mean, RMSD and minimum of column 3; one result row per input file.
awk -v rescore="$rescore" '
FNR==1 {
if (n) { # calculate the results of previous file
m = s / n # mean
var = s2 / n - m * m # variance
if (var < 0) var = 0 # avoid an exception due to round-off error
mean[suffix] = m # store the mean in an array
rmsd[suffix] = sqrt(var)
lowest[suffix] = min
}
prefix=suffix=FILENAME
sub(/_.*/, "", prefix)
sub(/\/[^\/]+$/, "", suffix)
sub(/^.*_/, "", suffix)
s = 0 # sum of $3
s2 = 0 # sum of $3 ** 2
n = 0 # count of samples
# NOTE(review): sentinel assumes all dG values are <= 0; raise it if positive
#   values are possible (see the remark after this script)
min = 0 # lowest value of $3
}
FNR > 1 {
s += $3
s2 += $3 * $3
++n
if ($3 < min) min = $3 # update the lowest value
}
END {
if (n) { # just to avoid division by zero
m = s / n
var = s2 / n - m * m
if (var < 0) var = 0
mean[suffix] = m
rmsd[suffix] = sqrt(var)
lowest[suffix] = min
}
out = rescore "/" prefix ".csv"
print prefix ":", "dG(mean)", "dG(rmsd)", "dG(lowest)" > out
# awk keeps "out" open after the first print, so the ">" below appends
# rather than truncating on every loop iteration
for (i in mean)
printf "%s: %.2f %.2f %.2f\n", i, mean[i], rmsd[i], lowest[i] > out
}' file_*.csv
I've assumed all dG values are negative. If there is any chance a
value is greater than zero, change the line min = 0 so that it initializes
the variable to a considerably large value (10,000 or whatever).
Please apply your modifications regarding the filenames, if needed.
The suggestions by Ed Morton are also included although the results will be the same.
I have a csv file that looks like that:
col1|col2
1|a
2|g
3|f
1|m
3|k
2|n
2|a
1|d
4|r
3|s
where | separates the columns, and would like to transform it into something homogeneous like:
------------------------
fields > 1 2 3 4
record1 a g f
record2 m n k
record3 d a s r
------------------------
Is there a way to do that? What would be better, using mysql or editing the csv file?
I wrote this, works for your example: gawk is required
# NOTE(review): requires gawk — asorti() and length() on an array are gawk-only.
# NOTE(review): RS="" is paragraph mode, so records must be separated by blank
#   lines — verify the input file actually contains them. Within one record a
#   duplicate key silently keeps only the last value.
awk -F'|' -v RS="" '{for(i=1;i<=NF;i+=2)a[$i]=$(i+1);asorti(a,d);
for(i=1;i<=length(a);i++)printf "%s", a[d[i]]((i==length(a))?"":" ");delete a;delete d;print ""}' file
example:
kent$ cat file
1|a
2|g
3|f
1|m
3|k
2|n
2|a
1|d
4|r
3|s
kent$ awk -F'|' -v RS="" '{for(i=1;i<=NF;i+=2)a[$i]=$(i+1);asorti(a,d);
for(i=1;i<=length(a);i++)printf "%s", a[d[i]]((i==length(a))?"":" ");delete a;delete d;print ""}' file
a g f
m n k
d a s r
Here an awk solution:
# Two-pass transpose of blank-line-separated "col|value" records into a table.
# Run as: awk -f script.awk file file   (pass 1 finds the widest record)
BEGIN{
RS=""                            # paragraph mode: blank-line separated records
FS="\n"                          # one "col|value" pair per field
}
FNR==NR&&FNR>1{                  # pass 1: find the highest column number
for (i=1;i<=NF;i++) {
split($i,d,"|")
if (d[1] > max)
max = d[1]
}
next
}
FNR>1&&!header{                  # pass 2: print the header row exactly once
printf "%s\t","fields >"
for (i=1;i<=max;i++)
printf "%s\t",i
print ""
header=1
}
FNR>1{                           # pass 2: one output row per record
printf "record%s\t\t",FNR-1
for (i=1;i<=NF;i++) {
split($i,d,"|")
val[d[1]] = d[2]
}
for (i=1;i<=max;i++)
# "(i in val)" instead of "val[i]?...": a stored value of "0" or "" is
# falsy in awk and would wrongly be printed as NULL by the original test
printf "%s\t",((i in val) ? val[i] : "NULL")
print ""
delete val
}
Save as script.awk and run like (notice it uses a two pass approach so you need to give the file twice):
$ awk -f script.awk file file
fields > 1 2 3 4
record1 a g f NULL
record2 m n k NULL
record3 d a s r
Adding the line 5|b to the first record in file gives the output:
$ awk -f script.awk file file
fields > 1 2 3 4 5
record1 a g f NULL b
record2 m n k NULL NULL
record3 d a s r NULL
$ cat file
col1|col2
1|a
2|g
3|f
5|b
1|m
3|k
2|n
2|a
1|d
4|r
3|s
$
$ awk -f tst.awk file
fields > 1 2 3 4 5
record1 a g f NULL b
record2 m n k NULL NULL
record3 d a s r NULL
$
$ cat tst.awk
# Single-pass transpose: cache every (record, column) cell, then print a
# fixed-width table padding missing cells with NULL.
BEGIN { RS = ""; FS = "\n" }
NR > 1 {
    recCnt++
    for (line = 1; line <= NF; line++) {
        split($line, pair, "|")          # pair[1] = column number, pair[2] = value
        col = pair[1]
        cell[recCnt, col] = pair[2]
        if (col > maxCol)
            maxCol = col                 # track the widest record seen
    }
}
END {
    printf "fields >"
    for (col = 1; col <= maxCol; col++)
        printf " %4s", col
    print ""
    for (rec = 1; rec <= recCnt; rec++) {
        printf "record%d ", rec
        for (col = 1; col <= maxCol; col++)
            printf " %4s", ((rec, col) in cell ? cell[rec, col] : "NULL")
        print ""
    }
}
I'm new to awk and I want to ask...
i have a csv file like this
IVALSTART IVALEND IVALDATE
23:00:00 23:30:00 4/9/2012
STATUS LSN LOC
K lskpg 1201
K lntrjkt 1201
K lbkkstp 1211
and i want to change like this
IVALSTART IVALEND
23:00:00 23:30:00
STATUS LSN LOC IVALDATE
K lskpg 1201 4/9/2012
K lntrjkt 1201 4/9/2012
K lbkkstp 1211 4/9/2012
How to do it in awk?
thanks and best regards!
Try this:
awk '
NR <= 2 { hold[NR] = $3; print $1, $2 }   # stash IVALDATE header name / value, echo first two cols
NR == 3 { print "" }                      # emit an empty line in place of input line 3
NR == 4 { $4 = hold[1]; print }           # append the stashed header name to the second header row
NR > 4  { $4 = hold[2]; print }           # append the stashed date to every data row
' FILE
If you need formating, it's necessary to change print to printf with appropriate specifiers.
Let say I have 2 fields displaying epoch time in microseconds:
1318044415123456,1318044415990056
What I wanted to do is:
Cut the common part from both fields: "1318044415"
Get the difference of the remaining parts: 990056 - 123456 = 866600
Why am I doing this? Because awk uses floating point IEEE 754 but not 64 bit integers and I need to get difference of epoch time of 2 events in microseconds.
Thanks for any help!
EDIT:
Finally I found the largest number Awk could handle on Snow Leopard 10.6.8: 9007199254740992.
Try this: echo '9007199254740992' | awk -F ',' '{print $1 + 0}'
The version of Awk was 20070501 (produced by awk --version)
Here is an awk script that meets your requirements:
# Subtract two microsecond epoch timestamps without losing precision:
# drop the shared leading digits first, then diff the small remainders.
BEGIN { FS = "," }
{
    # k = length of the common prefix, but never consume $1 entirely
    k = 0
    while (k < length($1) - 1 && substr($1, k + 1, 1) == substr($2, k + 1, 1))
        k++
    # the remainders are small enough for exact double arithmetic
    print substr($2, k + 1) + 0 - (substr($1, k + 1) + 0)
}