BigRedEO BigRedEO - 5 months ago 24
Perl Question

Perl parse CSV file "fill" and "null" fields

Okay - I'm going to post my entire script since I get chastised when I don't do it - even though, last time I did that I got chastised for posting the whole script. I simply need to know if the one line I originally asked about would work. ENTIRE SCRIPT (which was working just fine until the other dept gave me their data entirely differently than what we were originally told it would be) TO FOLLOW AT THE END

I'm parsing through and scrubbing a CSV file to make it ready to be loaded in a MySQL table. It is loaded through the table via someone else's "batch Java program" and if any field is empty the batch file stops with an error.

I've been told to just put in a blank space whenever there's an empty field in any record. Would something as simple as this work?

if ( ! length $fields[2] ) {
$_ = ' ' for $fields[2];
}


And would there be a way to check either various multiple fields at once? Or what might be better would be to check ALL the fields (this is after the record has been split) as the last thing I do just before writing the record back out to the CSV file.

Here's the entire script. Please don't tell me how what I'm doing within the already working script is not how you would do it. -

#!/usr/bin/perl/

use strict;
use warnings;
use Data::Dumper;
use Time::Piece;

my $filename = 'mistints_1505_comma.csv';
#my $filename = 'test.csv';

# Open input file
open my $FH, $filename
or die "Could not read from $filename <$!>, program halting.";

# Open error handling file
open ( my $ERR_FH, '>', "errorFiles1505.csv" ) or die $!;

# Read the header line of the input file and print to screen.
chomp(my $line = <$FH>);
my @fields = split(/,/, $line);
print Dumper(@fields), $/;

my @data;

# Read the lines one by one.
while($line = <$FH>) {

chomp($line);

# Scrub data of characters that cause scripting problems down the line.
$line =~ s/[\'\\]/ /g;

# split the fields of each record

my @fields = split(/,/, $line);

# Check if the storeNbr field is empty. If so, write record to error file.
if (!length $fields[28]) {
chomp (@fields);
my $str = join ',', @fields;
print $ERR_FH "$str\n";
}
else
{

# Concatenate the first three fields and add to the beginning of each record
unshift @fields, join '_', @fields[28..30];

# Format the DATE fields for MySQL
$_ = join '-', (split /\//)[2,0,1] for @fields[10,14,24,26];

# Scrub colons from the data
$line =~ s/:/ /g;

# If Spectro_Model is "UNKNOWN", change
if($fields[22] eq "UNKNOWN"){
$_ = 'UNKNOW' for $fields[22];
}

# If tran_date is blank, insert 0000-00-00
if(!length $fields[10]){
$_ = '0000-00-00' for $fields[10];
}

# If init_tran_date is blank, insert 0000-00-00
if(!length $fields[14]){
$_ = '0000-00-00' for $fields[14];
}

# If update_tran_date is blank, insert 0000-00-00
if(!length $fields[24]){
$_ = '0000-00-00' for $fields[24];
}

# If cancel_date is blank, insert 0000-00-00
if(!length $fields[26]){
$_ = '0000-00-00' for $fields[26];
}

# Format the PROD_NBR field by deleting any leading zeros before decimals.
$fields[12] =~ s/^\s*0\././;

# put the records back
push @data, \@fields;
}
}

close $FH;
close $ERR_FH;

print "Unsorted:\n", Dumper(@data); #, $/;

#Sort the clean files on Primary Key, initTranDate, updateTranDate, and updateTranTime
@data = sort {
$a->[0] cmp $b->[0] ||
$a->[14] cmp $b->[14] ||
$a->[26] cmp $b->[26] ||
$a->[27] cmp $b-> [27]
} @data;

#open my $OFH, '>', '/swpkg/shared/batch_processing/mistints/parsedMistints.csv';
open my $OFH, '>', '/swpkg/shared/batch_processing/mistints/cleaned1505.csv';
print $OFH join(',', @$_), $/ for @data;
close $OFH;

exit;

Answer

As far as I can tell you have split a record on commas ,, and you want to alter all fields that are empty strings to contain a single space

I would write this

use strict;
use warnings qw/ all FATAL /;

my $record = 'a,b,c,,e,,g,,i,,k,,m,n,o,p,q,r,s,t';

my @fields = map { $_ eq "" ? ' ' : $_ } split /,/, $record;


use Data::Dump;
dd \@fields;

output

[ "a", "b", "c", " ", "e", " ", "g", " ", "i", " ", "k", " ", "m" .. "t" ]

Alternatively, if you have some fields that need to be set to something different if they are empty, you can set up an array of defaults

That would look like this. All of the @defaults array is set to spaces except for fields 10, 11 and 12, which are 0000-00-00. These are picked up after the record is split

use strict;
use warnings qw/ all FATAL /;

my @defaults = (' ') x 20;

$defaults[$_] = '0000-00-00' for 9, 10, 11;

my $record = 'a,b,c,,e,,g,,i,,k,,m,n,o,p,q,r,s,t';

my @fields = split /,/, $record;

for my $i ( 0 .. $#fields ) {
    $fields[$i] = $defaults[$i] if $fields[$i] eq '';
}


use Data::Dump;
dd \@fields;

output

[ "a", "b", "c", " ", "e", " ", "g", " ", "i", "0000-00-00", "k", "0000-00-00", "m" .. "t" ]



Having seen your full program, I recommend something like this. If you had shown a sample of your input data then I could have used a hash to refer to column names instead of numbers, making it much more readable

#!/usr/bin/perl/

use strict;
use warnings 'all';

use Data::Dumper;
use Time::Piece;

my $filename = 'mistints_1505_comma.csv';
#my $filename = 'test.csv';

open my $FH, $filename
        or die "Could not read from $filename <$!>, program halting.";

open( my $ERR_FH, '>', "errorFiles1505.csv" ) or die $!;

chomp( my $line = <$FH> );
my @fields = split /,/, $line;    #/
print Dumper( \@fields ), "\n";

my @data;

# Read the lines one by one.
while ( <$FH> ) {

    chomp;

    # Scrub data of characters that cause scripting problems down the line.
    tr/'\\/  /;                   #'

    my @fields = split /,/;       #/

    # Check if the storeNbr field is empty.  If so, write record to error file.

    if ( $fields[28] eq "" ) {
        my $str = join ',', @fields;
        print $ERR_FH "$str\n";
        next;
    }

    # Concatenate the first three fields and add to the beginning of each record
    unshift @fields, join '_', @fields[ 28 .. 30 ];

    # Format the DATE fields for MySQL
    $_ = join '-', ( split /\// )[ 2, 0, 1 ] for @fields[ 10, 14, 24, 26 ];

    # Scrub colons from the data
    tr/://d;                      #/

    my $i = 0;
    for ( @fields ) {

        # If "Spectro_Model" is "UNKNOWN" then change to "UNKNOW"
        if ( $i == 22 ) {
            $_ = 'UNKNOW' if $_ eq 'UNKNOWN';
        }

        # If a date field is blank then insert 0000-00-00
        elsif ( grep { $i == $_ } 10, 14, 24, 26 ) {
            $_ = '0000-00-00' if $_ eq "";
        }

        # Format the PROD_NBR field by deleting any leading zeros before decimals.
        elsif ( $i == 12 ) {
            s/^\s*0\././;
        }

        # Change all remaining empty fields to a single space
        else {
            $_ = ' ' if $_ eq "";
        }

        ++$i;
    }

    push @data, \@fields;
}

close $FH;
close $ERR_FH;

print "Unsorted:\n", Dumper(@data);    #, $/;

#Sort the clean files on Primary Key, initTranDate, updateTranDate, and updateTranTime
@data = sort {
    $a->[0] cmp $b->[0]   or
    $a->[14] cmp $b->[14] or
    $a->[26] cmp $b->[26] or
    $a->[27] cmp $b->[27]
} @data;

#open my $OFH, '>', '/swpkg/shared/batch_processing/mistints/parsedMistints.csv';
open my $OFH, '>', '/swpkg/shared/batch_processing/mistints/cleaned1505.csv' or die $!;
print $OFH join(',', @$_), $/ for @data;
close $OFH;