Removing duplicates from a file, keeping the last occurrence of each line

input file
a
b
c
d
a
b
c
f
e
I tried the options below:
awk '!x[$0]++' file.txt > file.txt.tmp && mv file.txt.tmp file.txt
perl -ne 'print unless $dup{$_}++;' file.txt > file.txt.tmp && mv file.txt.tmp file.txt
awk '{if (++dup[$0] == 1) print $0;}' file.txt > file.txt.tmp && mv file.txt.tmp file.txt
But these remove duplicates keeping the first occurrence of each line, giving output like below:
a
b
c
d
f
e
But we need output like below, keeping the last occurrence of each line instead.
Output file
d
a
b
c
f
e

I got the answer below. It reads the file twice: the first pass records the line number of the last occurrence of each key, and the second pass prints a line only when it is that last occurrence.
awk -F'|' '{k=$1 FS $2} NR==FNR {A[k]=NR; next} A[k]==FNR' file.txt file.txt
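For input like this, where the whole line is the key, the same two-pass idea can be written without the field splitting; a minimal sketch:

awk 'NR==FNR { last[$0] = NR; next } last[$0] == FNR' file.txt file.txt

The first pass stores the highest line number seen for each line; the second pass prints a line only when FNR matches that stored number.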

Related

Subtract a fixed number of days from a date column using awk and add it as a new column

Let's assume that we have a file with the values seen below:
% head test.csv
20220601,A,B,1
20220530,A,B,1
And we want to add two new columns, one with the date minus 1 day and one with the date minus 7 days, resulting in the following:
% head new_test.csv
20220601,A,B,20220525,20220531,1
20220530,A,B,20220523,20220529,1
The awk that was used to produce the above is:
% awk 'BEGIN{FS=OFS=","} { a="date -d \"$(date -d \""$1"\") -7 days\" +'%Y%m%d'"; a | getline st ; close(a) ;b="date -d \"$(date -d \""$1"\") -1 days\" +'%Y%m%d'"; b | getline cb ; close(b) ;print $1","$2","$3","st","cb","$4}' test.csv > new_test.csv
But after applying the above to a large file with more than 100K lines, it runs for 20 minutes. Is there any way to optimize the awk?
One GNU awk approach; the bottleneck above is spawning external date processes for every line, and doing the date arithmetic inside awk avoids that entirely:
awk '
BEGIN { FS = OFS = ","
        secs_in_day = 60 * 60 * 24
}
{
        dt  = mktime( substr($1,1,4) " " substr($1,5,2) " " substr($1,7,2) " 12 0 0" )
        dt1 = strftime("%Y%m%d", dt - secs_in_day)
        dt7 = strftime("%Y%m%d", dt - (secs_in_day * 7))
        print $1, $2, $3, dt7, dt1, $4
}
' test.csv
This generates:
20220601,A,B,20220525,20220531,1
20220530,A,B,20220523,20220529,1
NOTES:
requires GNU awk for the mktime() and strftime() functions; see the GNU awk documentation on time functions for more details
other flavors of awk may have similar functions; YMMV
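the " 12 0 0" (midday) timestamp passed to mktime() is deliberate: anchoring the arithmetic at noon instead of midnight keeps a daylight-saving transition from shifting the result by a day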
You can try wrapping the date commands in awk functions; in my testing it was faster than building and running the commands inline for every line.
awk -F, '
function cmd1(date,    a, st) {
    a = "date -d \"$(date -d \"" date "\") -1 days\" +%Y%m%d"
    a | getline st
    close(a)        # close before returning; a close() placed after return is never reached
    return st
}
function cmd2(date,    b, cm) {
    b = "date -d \"$(date -d \"" date "\") -7 days\" +%Y%m%d"
    b | getline cm
    close(b)
    return cm
}
{
    $5 = cmd1($1)
    $6 = cmd2($1)
    print $1, $2, $3, $5, $6, $4
}' OFS=, test > newFileTest
I executed this against a file with 20,000 records and it finished in seconds, compared to around 5 minutes for the original awk.
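If the file contains many repeated dates, a further speedup (a sketch, not from the original thread; the shift() helper is hypothetical) is to cache the result of each external call per distinct date, so date runs at most twice per unique value instead of twice per line:

awk -F, '
function shift(date, days,    c, out) {
    # cache[] is keyed on the date plus the offset, so each combination
    # spawns the external date command only once
    if ((date, days) in cache)
        return cache[date, days]
    c = "date -d \"" substr(date,1,4) "-" substr(date,5,2) "-" substr(date,7,2) " " days " days\" +%Y%m%d"
    c | getline out
    close(c)
    return cache[date, days] = out
}
{ print $1, $2, $3, shift($1, -7), shift($1, -1), $4 }' OFS=, test.csv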

How to use grep command inside Tcl script

How do I run a simple grep command in a Tcl script and get its output?
grep B file1 > temp # bash grep command that needs to be executed inside a Tcl command
file1 looks like this:
1 2 3 6 180.00 B
1 2 3 6 F
2 3 6 23 50.00 B
2 3 6 23 F
These do not work:
exec grep B file.txt > temp
child process exited abnormally
exec "grep B pes_test.com > temp1"
couldn't execute "grep -e B ./pes_test.com > temp1": no such file or directory
exec /bin/sh -c {grep -e B ; true} < pes_test.com > tmp1
works, but does not give output.
exec throws an error when the process returns non-zero; see the exec man page and the Tcl wiki.
try {
    set result [exec grep $pattern $file]
} on error {e} {
    # typically, pattern not found
    set result ""
}
Ref: try man page
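Since the original command redirected the matches into a temp file, one way to finish the job from Tcl (a sketch; the pattern and file names are the question's placeholders):

try {
    set result [exec grep B file1]
} on error {e} {
    # grep exits non-zero when nothing matches
    set result ""
}
set fh [open temp w]
puts $fh $result
close $fh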

awk: delete field if other columns match

I have a CSV file that looks like this:
col1,col2,col3,col4
1,2,a,01/01
2,3,b,01/02
3,4,5,c,01/03
2,5,6,c,01/03
The last 2 rows have been appended to the file, but they carry an extra (third) column. I want to delete the third column from those last 2 rows (i.e. where column 4 == "c" and column 5 == "01/03").
The output I want removes the third column from the last 2 rows, so that every row has only 4 columns:
col1,col2,col3,col4
1,2,a,01/01
2,3,b,01/02
3,4,c,01/03
2,5,c,01/03
If it can be done in vim, that would be good too (see the sketch at the end of this thread).
Here's a slightly different approach that avoids having to type the list of columns to be included:
awk -F, 'BEGIN {OFS=FS} NF==5 {for(i=3;i<=NF;i++){$i=$(i+1)}; NF--} 1'
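Note: removing trailing fields by decrementing NF works in GNU awk and most modern implementations, but not every historical awk rebuilds $0 when NF shrinks, so test your awk if portability matters.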
The solution with an explicit listing of columns can also be written more compactly as follows:
awk -F, 'BEGIN {OFS=FS} NF == 5 {print $1, $2, $4, $5; next} 1'
This should do it:
awk -F, 'BEGIN {OFS=","} {if (NF == 5) {print $1, $2, $4, $5} else {print}}' filename
$ awk 'BEGIN{FS=OFS=","} {print $1,$2,$(NF-1),$NF}' file
col1,col2,col3,col4
1,2,a,01/01
2,3,b,01/02
3,4,c,01/03
2,5,c,01/03
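For the vim part of the question, a single substitute can drop the third field on just the affected rows (a sketch, assuming the 5-field rows are exactly those ending in c,01/03):

:%s/^\([^,]*,[^,]*\),[^,]*\(,c,01\/03\)$/\1\2/

The first group keeps columns 1-2, the ,[^,]* in the middle swallows the extra third field, and the second group keeps the c,01/03 tail; 4-column rows don't match the pattern and are left alone.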

Paste columns of different MySQL tables into an ascii file

I have hundreds of MySQL tables, and I would like to create an ascii table with the first column of each MySQL table.
From MySQL tables
table A table B ... table Z
A1 A2 A3 B1 B2 B3 Z1 Z2 Z3
A1 A2 A3 B1 B2 B3 Z1 Z2 Z3
A1 A2 A3 B1 B2 B3 Z1 Z2 Z3
to an ascii file
A1 B1 ... Z1
A1 B1 ... Z1
A1 B1 ... Z1
Which is the fastest method?
There are hundreds of tables, the columns have thousands of rows, and above all the columns have the same number of rows (so I don't think a "join" before the export is necessary).
Thanks a lot
I am not a bash expert, but I do things stepwise (other experts can edit and improve my answer).
#!/bin/bash
# first pass: record each table name together with the name of its first column
while read tablename
do
    fieldname=$(mysql -uusrname -ppassword -Ddatabasename -s -e "desc $tablename" | head -2 | tail -1 | awk '{print $1}')
    echo "$tablename,$fieldname" >> tempfile
done < <(mysql -uusrname -ppassword -Ddatabasename -s -e 'show tables')
lastcount=$(wc -l < tempfile)
# second pass: build one SQL statement selecting the first column of every table
count=0
while read line
do
    count=$((count + 1))
    if [ $count -lt $lastcount ]
    then
        echo "$line" | awk -F, '{print "(select " $2 " from " $1 ")"}' >> sqlcommand.sql
        echo "union" >> sqlcommand.sql
    else
        echo "$line" | awk -F, '{print "(select " $2 " from " $1 ");"}' >> sqlcommand.sql
    fi
done < tempfile
mysql -uusrname -ppassword -Ddatabasename < sqlcommand.sql >> outputasciifile.txt
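Note that union stacks the selected columns vertically (and removes duplicate rows); to paste the columns side by side as in the desired output, one alternative (a sketch, assuming equal row counts; the file names are illustrative) is to dump each first column to its own file and merge them with paste:

#!/bin/bash
# dump the first column of every table into its own file, then merge column-wise
for t in $(mysql -uusrname -ppassword -Ddatabasename -s -e 'show tables'); do
    col=$(mysql -uusrname -ppassword -Ddatabasename -s -e "desc $t" | head -2 | tail -1 | awk '{print $1}')
    mysql -uusrname -ppassword -Ddatabasename -s -e "select $col from $t" > "col_$t.txt"
done
paste -d' ' col_*.txt > outputasciifile.txt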

Dynamically edit lists within a csv file

I have a csv file that looks like that:
col1|col2

1|a
2|g
3|f

1|m
3|k
2|n

2|a
1|d
4|r
3|s
where | separates the columns and a blank line separates the records, and I would like to transform it into something homogeneous like:
------------------------
fields > 1 2 3 4
record1 a g f
record2 m n k
record3 d a s r
------------------------
Is there a way to do that? What would be better, using mysql or editing the csv file?
I wrote this; it works for your example (gawk is required, for asorti()):
awk -F'|' -v RS="" '{for(i=1;i<=NF;i+=2)a[$i]=$(i+1);asorti(a,d);
for(i=1;i<=length(a);i++)printf "%s", a[d[i]]((i==length(a))?"":" ");delete a;delete d;print ""}' file
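Why stepping by 2 works: with RS="" awk runs in paragraph mode, where each blank-line-separated block is one record and newline acts as an additional field separator, so the record's fields alternate between field number and value and a[$i] = $(i+1) maps each number to its value.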
example:
kent$ cat file
1|a
2|g
3|f

1|m
3|k
2|n

2|a
1|d
4|r
3|s
kent$ awk -F'|' -v RS="" '{for(i=1;i<=NF;i+=2)a[$i]=$(i+1);asorti(a,d);
for(i=1;i<=length(a);i++)printf "%s", a[d[i]]((i==length(a))?"":" ");delete a;delete d;print ""}' file
a g f
m n k
d a s r
Here is an awk solution:
BEGIN {
    RS = ""
    FS = "\n"
}
# first pass: find the highest field number used in any record
FNR==NR && FNR>1 {
    for (i=1; i<=NF; i++) {
        split($i, d, "|")
        if (d[1] > max)
            max = d[1]
    }
    next
}
# second pass: print the header line once
FNR>1 && !header {
    printf "%s\t", "fields >"
    for (i=1; i<=max; i++)
        printf "%s\t", i
    print ""
    header = 1
}
# second pass: print one output row per record, NULL for missing fields
FNR>1 {
    printf "record%s\t\t", FNR-1
    for (i=1; i<=NF; i++) {
        split($i, d, "|")
        val[d[1]] = d[2]
    }
    for (i=1; i<=max; i++)
        printf "%s\t", val[i] ? val[i] : "NULL"
    print ""
    delete val
}
Save it as script.awk and run it like this (notice it uses a two-pass approach, so you need to give the file twice):
$ awk -f script.awk file file
fields > 1 2 3 4
record1 a g f NULL
record2 m n k NULL
record3 d a s r
Adding the line 5|b to the first record in file gives the output:
$ awk -f script.awk file file
fields > 1 2 3 4 5
record1 a g f NULL b
record2 m n k NULL NULL
record3 d a s r NULL
$ cat file
col1|col2

1|a
2|g
3|f
5|b

1|m
3|k
2|n

2|a
1|d
4|r
3|s
$
$ awk -f tst.awk file
fields > 1 2 3 4 5
record1 a g f NULL b
record2 m n k NULL NULL
record3 d a s r NULL
$
$ cat tst.awk
BEGIN { RS=""; FS="\n" }
NR>1 {
    ++numRecs
    for (i=1; i<=NF; i++) {
        split($i, fldNr2val, "|")
        fldNr = fldNr2val[1]
        val = fldNr2val[2]
        recNrFldNr2val[numRecs, fldNr] = val
        numFlds = (fldNr > numFlds ? fldNr : numFlds)
    }
}
END {
    printf "fields >"
    for (fldNr=1; fldNr<=numFlds; fldNr++) {
        printf " %4s", fldNr
    }
    print ""
    for (recNr=1; recNr<=numRecs; recNr++) {
        printf "record%d ", recNr
        for (fldNr=1; fldNr<=numFlds; fldNr++) {
            printf " %4s", ((recNr,fldNr) in recNrFldNr2val ? recNrFldNr2val[recNr,fldNr] : "NULL")
        }
        print ""
    }
}