Donbeo Donbeo - 1 month ago 10
Bash Question

how to concatenate zipped csv files without repeating the headers with shell

I have a folder with a lot of

gz
files.

out_batch_100_start_1477252800_end_1477256400_.gz
out_batch_101_start_1477256400_end_1477260000_.gz
out_batch_102_start_1477260000_end_1477263600_.gz
out_batch_103_start_1477263600_end_1477267200_.gz
out_batch_104_start_1477267200_end_1477270800_.gz
out_batch_105_start_1477270800_end_1477274400_.gz
out_batch_106_start_1477274400_end_1477278000_.gz
out_batch_107_start_1477278000_end_1477281600_.gz
out_batch_108_start_1477281600_end_1477285200_.gz
out_batch_109_start_1477285200_end_1477288800_.gz
out_batch_10_start_1476928800_end_1476932400_.gz
out_batch_110_start_1477288800_end_1477292400_.gz
out_batch_111_start_1477292400_end_1477296000_.gz
out_batch_112_start_1477296000_end_1477299600_.gz
out_batch_113_start_1477299600_end_1477303200_.gz
out_batch_114_start_1477303200_end_1477306800_.gz
out_batch_115_start_1477306800_end_1477310400_.gz
out_batch_116_start_1477310400_end_1477314000_.gz
out_batch_117_start_1477314000_end_1477317600_.gz
out_batch_118_start_1477317600_end_1477321200_.gz
out_batch_119_start_1477321200_end_1477324800_.gz
out_batch_11_start_1476932400_end_1476936000_.gz
out_batch_120_start_1477324800_end_1477328400_.gz
out_batch_121_start_1477328400_end_1477332000_.gz
out_batch_122_start_1477332000_end_1477335600_.gz
out_batch_123_start_1477335600_end_1477339200_.gz
out_batch_124_start_1477339200_end_1477342800_.gz
out_batch_125_start_1477342800_end_1477346400_.gz
out_batch_126_start_1477346400_end_1477350000_.gz
out_batch_127_start_1477350000_end_1477353600_.gz
out_batch_128_start_1477353600_end_1477357200_.gz
out_batch_129_start_1477357200_end_1477360800_.gz
out_batch_12_start_1476936000_end_1476939600_.gz
out_batch_130_start_1477360800_end_1477364400_.gz
out_batch_131_start_1477364400_end_1477368000_.gz
out_batch_132_start_1477368000_end_1477371600_.gz
out_batch_133_start_1477371600_end_1477375200_.gz
out_batch_134_start_1477375200_end_1477378800_.gz
out_batch_135_start_1477378800_end_1477382400_.gz
out_batch_136_start_1477382400_end_1477386000_.gz
out_batch_137_start_1477386000_end_1477389600_.gz
out_batch_138_start_1477389600_end_1477393200_.gz
out_batch_139_start_1477393200_end_1477396800_.gz
out_batch_13_start_1476939600_end_1476943200_.gz
out_batch_140_start_1477396800_end_1477400400_.gz
out_batch_141_start_1477400400_end_1477404000_.gz
out_batch_142_start_1477404000_end_1477407600_.gz
out_batch_143_start_1477407600_end_1477411200_.gz
out_batch_144_start_1477411200_end_1477414800_.gz
out_batch_145_start_1477414800_end_1477418400_.gz
out_batch_146_start_1477418400_end_1477422000_.gz
out_batch_147_start_1477422000_end_1477425600_.gz
out_batch_148_start_1477425600_end_1477429200_.gz
out_batch_149_start_1477429200_end_1477432800_.gz
out_batch_14_start_1476943200_end_1476946800_.gz
out_batch_150_start_1477432800_end_1477436400_.gz
out_batch_151_start_1477436400_end_1477440000_.gz
out_batch_152_start_1477440000_end_1477443600_.gz
out_batch_153_start_1477443600_end_1477447200_.gz
out_batch_154_start_1477447200_end_1477450800_.gz
out_batch_155_start_1477450800_end_1477454400_.gz
out_batch_156_start_1477454400_end_1477458000_.gz
out_batch_157_start_1477458000_end_1477461600_.gz
out_batch_158_start_1477461600_end_1477465200_.gz
out_batch_159_start_1477465200_end_1477468800_.gz
out_batch_15_start_1476946800_end_1476950400_.gz
out_batch_160_start_1477468800_end_1477472400_.gz
out_batch_161_start_1477472400_end_1477476000_.gz
out_batch_162_start_1477476000_end_1477479600_.gz
out_batch_163_start_1477479600_end_1477483200_.gz
out_batch_164_start_1477483200_end_1477486800_.gz
out_batch_165_start_1477486800_end_1477490400_.gz
out_batch_166_start_1477490400_end_1477494000_.gz
out_batch_167_start_1477494000_end_1477497600_.gz
out_batch_168_start_1477497600_end_1477501200_.gz
out_batch_16_start_1476950400_end_1476954000_.gz
out_batch_17_start_1476954000_end_1476957600_.gz
out_batch_18_start_1476957600_end_1476961200_.gz
out_batch_19_start_1476961200_end_1476964800_.gz
out_batch_1_start_1476896400_end_1476900000_.gz
out_batch_20_start_1476964800_end_1476968400_.gz
out_batch_21_start_1476968400_end_1476972000_.gz
out_batch_22_start_1476972000_end_1476975600_.gz
out_batch_23_start_1476975600_end_1476979200_.gz
out_batch_24_start_1476979200_end_1476982800_.gz
out_batch_25_start_1476982800_end_1476986400_.gz
out_batch_26_start_1476986400_end_1476990000_.gz
out_batch_27_start_1476990000_end_1476993600_.gz
out_batch_28_start_1476993600_end_1476997200_.gz
out_batch_29_start_1476997200_end_1477000800_.gz
out_batch_2_start_1476900000_end_1476903600_.gz
out_batch_30_start_1477000800_end_1477004400_.gz
out_batch_31_start_1477004400_end_1477008000_.gz
out_batch_32_start_1477008000_end_1477011600_.gz
out_batch_33_start_1477011600_end_1477015200_.gz
out_batch_34_start_1477015200_end_1477018800_.gz
out_batch_35_start_1477018800_end_1477022400_.gz
out_batch_36_start_1477022400_end_1477026000_.gz
out_batch_37_start_1477026000_end_1477029600_.gz
out_batch_38_start_1477029600_end_1477033200_.gz
out_batch_39_start_1477033200_end_1477036800_.gz
out_batch_3_start_1476903600_end_1476907200_.gz
out_batch_40_start_1477036800_end_1477040400_.gz
out_batch_41_start_1477040400_end_1477044000_.gz
out_batch_42_start_1477044000_end_1477047600_.gz
out_batch_43_start_1477047600_end_1477051200_.gz
out_batch_44_start_1477051200_end_1477054800_.gz
out_batch_45_start_1477054800_end_1477058400_.gz
out_batch_46_start_1477058400_end_1477062000_.gz
out_batch_47_start_1477062000_end_1477065600_.gz
out_batch_48_start_1477065600_end_1477069200_.gz
out_batch_49_start_1477069200_end_1477072800_.gz
out_batch_4_start_1476907200_end_1476910800_.gz
out_batch_50_start_1477072800_end_1477076400_.gz
out_batch_51_start_1477076400_end_1477080000_.gz
out_batch_52_start_1477080000_end_1477083600_.gz
out_batch_53_start_1477083600_end_1477087200_.gz
out_batch_54_start_1477087200_end_1477090800_.gz
out_batch_55_start_1477090800_end_1477094400_.gz
out_batch_56_start_1477094400_end_1477098000_.gz
out_batch_57_start_1477098000_end_1477101600_.gz
out_batch_58_start_1477101600_end_1477105200_.gz
out_batch_59_start_1477105200_end_1477108800_.gz
out_batch_5_start_1476910800_end_1476914400_.gz
out_batch_60_start_1477108800_end_1477112400_.gz
out_batch_61_start_1477112400_end_1477116000_.gz
out_batch_62_start_1477116000_end_1477119600_.gz
out_batch_63_start_1477119600_end_1477123200_.gz
out_batch_64_start_1477123200_end_1477126800_.gz
out_batch_65_start_1477126800_end_1477130400_.gz
out_batch_66_start_1477130400_end_1477134000_.gz
out_batch_67_start_1477134000_end_1477137600_.gz
out_batch_68_start_1477137600_end_1477141200_.gz
out_batch_69_start_1477141200_end_1477144800_.gz
out_batch_6_start_1476914400_end_1476918000_.gz
out_batch_70_start_1477144800_end_1477148400_.gz
out_batch_71_start_1477148400_end_1477152000_.gz
out_batch_72_start_1477152000_end_1477155600_.gz
out_batch_73_start_1477155600_end_1477159200_.gz
out_batch_74_start_1477159200_end_1477162800_.gz
out_batch_75_start_1477162800_end_1477166400_.gz
out_batch_76_start_1477166400_end_1477170000_.gz
out_batch_77_start_1477170000_end_1477173600_.gz
out_batch_78_start_1477173600_end_1477177200_.gz
out_batch_79_start_1477177200_end_1477180800_.gz
out_batch_7_start_1476918000_end_1476921600_.gz
out_batch_80_start_1477180800_end_1477184400_.gz
out_batch_81_start_1477184400_end_1477188000_.gz
out_batch_82_start_1477188000_end_1477191600_.gz
out_batch_83_start_1477191600_end_1477195200_.gz
out_batch_84_start_1477195200_end_1477198800_.gz
out_batch_85_start_1477198800_end_1477202400_.gz
out_batch_86_start_1477202400_end_1477206000_.gz
out_batch_87_start_1477206000_end_1477209600_.gz
out_batch_88_start_1477209600_end_1477213200_.gz
out_batch_89_start_1477213200_end_1477216800_.gz
out_batch_8_start_1476921600_end_1476925200_.gz
out_batch_90_start_1477216800_end_1477220400_.gz
out_batch_91_start_1477220400_end_1477224000_.gz
out_batch_92_start_1477224000_end_1477227600_.gz
out_batch_93_start_1477227600_end_1477231200_.gz
out_batch_94_start_1477231200_end_1477234800_.gz
out_batch_95_start_1477234800_end_1477238400_.gz
out_batch_96_start_1477238400_end_1477242000_.gz
out_batch_97_start_1477242000_end_1477245600_.gz
out_batch_98_start_1477245600_end_1477249200_.gz
out_batch_99_start_1477249200_end_1477252800_.gz
out_batch_9_start_1476925200_end_1476928800_.gz


I would like them to be concatenated in a single file. Each gz starts with the same headers. I would then like to keep only headers on the top of the file. In other words for all the
gz
except the first file the first line should be skipped.

How can I do that?

EDIT:
One of the files:

(numeric_python) luca@luca-VirtualBox:~/Documents/datasets/RNDC/7days$ zcat out_batch_1_start_1476896400_end_1476900000_.gz | head
Protocol,Src,Src Port,Dst,Dst Port,Group ID,Port,VPort,IP TOS,VLAN ID,VLAN Pri,MPLS Exp,Application,Packets,Messages,Bytes,First,Last,"EmergingThreats category","EmergingThreats description","EmergingThreats detailedDescription","EmergingThreats first_seen","EmergingThreats last_seen","EmergingThreats ports","EmergingThreats score","Entity AD_SecurityGroup","Entity Privileged","Entity Unique_Users","Entity User","OS OS","STIX description","STIX detailedDescription","STIX timeStamp","iSIGHT ThreatScape","iSIGHT asn","iSIGHT domain","iSIGHT networkIdentifier","iSIGHT networkType","iSIGHT port","iSIGHT productType","iSIGHT publishDate","iSIGHT reportId","iSIGHT reportLink","iSIGHT title","iSIGHT webLink"
ETH:0x0000,00:17:A4:77:9C:04,,09:00:2B:00:00:05,,2,PortC,0,,,,,"Unknown",472,0,28320,1476896408,1476899995,,,,,,,,,,,,,,,,,,,,,,,,,,,
ETH:0x0000,00:17:A4:77:9C:0A,,09:00:2B:00:00:04,,2,PortC,0,,,,,"Unknown",1938,0,125769,1476899771,1476899999,,,,,,,,,,,,,,,,,,,,,,,,,,,
ETH:0x0000,00:17:A4:77:9C:0A,,09:00:2B:00:00:05,,2,PortC,0,,,,,"Unknown",498,0,29880,1476896400,1476899999,,,,,,,,,,,,,,,,,,,,,,,,,,,
ETH:0x0000,00:26:F2:C1:00:00,,01:80:C2:00:00:00,,2,PortC,0,,,,,"Unknown",1801,0,108060,1476896401,1476899999,,,,,,,,,,,,,,,,,,,,,,,,,,,
ETH:0x0000,00:26:F2:C6:00:00,,01:80:C2:00:00:00,,2,PortC,0,,,,,"Unknown",1801,0,108060,1476896401,1476899999,,,,,,,,,,,,,,,,,,,,,,,,,,,
ETH:0x0000,00:26:F3:2A:00:00,,01:80:C2:00:00:00,,2,PortC,0,,,,,"Unknown",1801,0,108060,1476896401,1476899999,,,,,,,,,,,,,,,,,,,,,,,,,,,
ETH:0x0000,00:26:F3:8C:00:00,,01:80:C2:00:00:00,,2,PortC,0,,,,,"Unknown",1801,0,108060,1476896401,1476899999,,,,,,,,,,,,,,,,,,,,,,,,,,,
ETH:0x0000,00:30:6E:48:17:18,,09:00:09:00:00:01,,2,PortC,0,,,,,"Unknown",13,0,780,1476896412,1476899999,,,,,,,,,,,,,,,,,,,,,,,,,,,
ETH:0x0000,84:78:AC:15:44:BC,,01:80:C2:00:00:00,,2,PortC,0,,,,,"Unknown",1801,0,108060,1476896401,1476899999,,,,,,,,,,,,,,,,,,,,,,,,,,,
(numeric_python) luca@luca-VirtualBox:~/Documents/datasets/RNDC/7days$

Answer

The following concatenates the files previously stripping off the first lines:

find out_batch_103_start_*.gz -exec sh -c "zcat -q -c {} | tail -n +2 -" \;

where tail -n +2 - outputs the lines read from the standard input (-) starting from the second line.

It is easy to prepend a header to the output:

zcat out_batch_100_start_1477252800_end_1477256400_.gz | head -1 > output
find out_batch_103_start_*.gz \
  -exec sh -c "zcat -q -c {} | tail -n +2 -" \; >> output

The result is written to the output file.

If you want to compress the output on-the-fly, then you should group the two commands:

{ zcat out_batch_100_start_1477252800_end_1477256400_.gz | head -1 && \
    find out_batch_103_start_*.gz \
      -exec sh -c "zcat -q -c {} | tail -n +2 -" \;
      } | gzip - > output.gz

where { } creates a group of commands in Bash (see info bash {).