Skip to main content

awk

Intro

Like cut, awk can extract fields from text, but it is far more flexible: it can match patterns, reformat output, and perform calculations.

Syntax: awk [option] 'pattern or keyword { actions }' file

Basic usage

awk
$ echo a b c | awk '{print $1 $3}'
ac

$ echo a b c | awk '{print $1, $3}'
a c

$ df / | awk 'FNR>1 {print $1, $5}' # remove first line
/dev/sda1 93%

$ echo a,b,c | awk -F',' '{print $2}'
b

$ cat server.csv
...
3b56ea2f-03e6-45fb-b0c5-60475f97067e,test-sang,SHUTOFF,92.168.1.62

# we want to transform to this
# test-sang 3b56ea2f-03e6-45fb-b0c5-60475f97067e (SHUTOFF) | 92.168.1.62
$ awk -F',' '{print $2, $1, "(" $3 ")", "|", $4 }' server.csv
test-thinhdp 2a304fe9-82cc-4948-ae1b-c31e07c9e40b (ACTIVE) | 192.168.1.29
test-tritn 990fdc8d-08fe-47da-a555-9576940a4846 (ACTIVE) | 192.168.1.28
test-sang 3b56ea2f-03e6-45fb-b0c5-60475f97067e (SHUTOFF) | 92.168.1.62
-F sets the field separator
, (comma) in print inserts a space (the output field separator) between items
literal text such as "(" or "|" must be wrapped in double quotes

$ awk -F',' '/sang/{print $2, $1, "(" $3 ")", "|", $4 }' server.csv
test-sang 3b56ea2f-03e6-45fb-b0c5-60475f97067e (SHUTOFF) | 92.168.1.62

$ awk -F',' '$3~/SHUTOFF/{print $2, $1, "(" $3 ")", "|", $4 }' server.csv
# only rows whose third field ($3) matches the regex SHUTOFF are printed
test-sang 3b56ea2f-03e6-45fb-b0c5-60475f97067e (SHUTOFF) | 92.168.1.62

$ awk -F',' \
> 'BEGIN {print "List SHUTOFF servers:"} \
> $3~/SHUTOFF/{print "+", $2, $1, "(" $3 ")", "|", $4 } \
> END {print "==="}' server.csv
List SHUTOFF servers:
+ test-sang 3b56ea2f-03e6-45fb-b0c5-60475f97067e (SHUTOFF) | 92.168.1.62
===

# we can perform calculation
$ seq 1 100 | awk '{s+=$1} END {print s}'
5050

Find duplicate files

Imagine we have a directory with lots of images and we want to find out whether any of them are duplicates.

awk-duplicate-files
# Example from `Efficient Linux at the Command Line, Daniel J. Barrett` ch05
$ ls -l
total 160
-rw-rw-r-- 1 stackops stackops 5362 Apr 21 11:30 image001.jpg
-rw-rw-r-- 1 stackops stackops 5431 Apr 21 11:30 image002.jpg
-rw-rw-r-- 1 stackops stackops 5362 Apr 21 11:30 image003.jpg
-rw-rw-r-- 1 stackops stackops 5436 Apr 21 11:30 image004.jpg
-rw-rw-r-- 1 stackops stackops 5510 Apr 21 11:30 image005.jpg
-rw-rw-r-- 1 stackops stackops 5415 Apr 21 11:30 image006.jpg
-rw-rw-r-- 1 stackops stackops 5474 Apr 21 11:30 image007.jpg
-rw-rw-r-- 1 stackops stackops 5447 Apr 21 11:30 image008.jpg
-rw-rw-r-- 1 stackops stackops 5502 Apr 21 11:30 image009.jpg
-rw-rw-r-- 1 stackops stackops 5659 Apr 21 11:30 image010.jpg
-rw-rw-r-- 1 stackops stackops 5634 Apr 21 11:30 image011.jpg
-rw-rw-r-- 1 stackops stackops 5474 Apr 21 11:30 image012.jpg
-rw-rw-r-- 1 stackops stackops 5521 Apr 21 11:30 image013.jpg
-rw-rw-r-- 1 stackops stackops 5474 Apr 21 11:30 image014.jpg
-rw-rw-r-- 1 stackops stackops 5539 Apr 21 11:30 image015.jpg
-rw-rw-r-- 1 stackops stackops 5528 Apr 21 11:30 image016.jpg
-rw-rw-r-- 1 stackops stackops 5480 Apr 21 11:30 image017.jpg
-rw-rw-r-- 1 stackops stackops 5466 Apr 21 11:30 image018.jpg
-rw-rw-r-- 1 stackops stackops 5510 Apr 21 11:30 image019.jpg
-rw-rw-r-- 1 stackops stackops 5510 Apr 21 11:30 image020.jpg

$ md5sum *
146b163929b6533f02e91bdf21cb9563 image001.jpg
63da88b3ddde0843c94269638dfa6958 image002.jpg
146b163929b6533f02e91bdf21cb9563 image003.jpg
17f339ed03733f402f74cf386209aeb3 image004.jpg
381ebc2cd3aab91a65492ef360714e2c image005.jpg
1aa30608eb268a45266403f177f214d0 image006.jpg
f6464ed766daca87ba407aede21c8fcc image007.jpg
b965d5f3463e41eb66ea87a2933c407c image008.jpg
8f8d01a6598833fb04abc69fe9e9572c image009.jpg
bc64c99757e199ef858f52acf7e4e836 image010.jpg
714eceeb06b43c03fe20eb96474f69b8 image011.jpg
f6464ed766daca87ba407aede21c8fcc image012.jpg
3825f1cffa61aee4673f5b7c535b2a09 image013.jpg
f6464ed766daca87ba407aede21c8fcc image014.jpg
1258012d57050ef6005739d0e6f6a257 image015.jpg
c96a1094226ad766fc9367f508dc9b32 image016.jpg
bef69f30a2f88a20e81797ea65c1e082 image017.jpg
d8ad913044a51408ec1ed8a204ea9502 image018.jpg
c7978522c58425f6af3f095ef1de1cd5 image019.jpg
c7978522c58425f6af3f095ef1de1cd5 image020.jpg

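# awk arrays are associative: array[key] creates the entry on first use
# c[$1]++ uses the checksum ($1) as the key and counts how often it appears
# n[$1]=n[$1] " " $2 appends the file name ($2) to the list kept for that checksum
# loop structure: for (key in array) action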
$ md5sum *jpg | awk \
'{ c[$1]++; n[$1]=n[$1] " " $2 } \
END { for (key in c) print c[key], key, n[key]}' | grep -Ev ^1 | sort -nr
3 f6464ed766daca87ba407aede21c8fcc image007.jpg image012.jpg image014.jpg
2 c7978522c58425f6af3f095ef1de1cd5 image019.jpg image020.jpg
2 146b163929b6533f02e91bdf21cb9563 image001.jpg image003.jpg