Josh Carter Josh Carter - 4 months ago 8
Perl Question

Using shell script to redesign a CSV file



Currently I am using a Perl script that parses a log file and outputs a CSV file in this format:

$ read_scalepack.pl HR00169_07.log

mosaicity 0.52 - 1.59

SG & cell: p21 64.3 78.1 67.0 90.0 96.8 90.0



Resolution Measured Unique Redun. %Complete R-value I/sigI


Overall 50.00-2.40 60827 23031 2.6 89.1 0.073 11.3
First Shell 50.00-5.17 7173 2431 3.0 91.9 0.044 21.1
3 sig Shell 2.85-2.70 6473 2446 2.6 95.4 0.316 3.4
Penult Shell 2.59-2.49 4149 2056 2.0 80.1 0.370 2.4
Last Shell 2.49-2.40 3105 1722 1.8 66.9 0.343 2.1

Rejected observations. Number: 1599 Percent: 2.6


I then use a very long method of importing this into separate worksheet in Excel, copying values in the format I want to the "needed" sheet then deleting the imported CSV sheet (painful to watch). What I need is to either edit the Perl script (which I have no experience with) or use a shell script to have the CSV file be as follows:

p21 64.3 78.1 67.0 90.0 96.8 90.0 0.52-1.59 50.00-2.40 60827 23031 2.6 89.1 0.073 50.00-5.17 7173 2431 3.0 91.9 0.044 2.49-2.40 3105 1722 1.8 66.9 0.343 2.1


Basically one long line of statistics instead of stacked statistics with words.

I am using in one line the values from ("SG and cell", "mosaicity", "Overall", "First Shell", "Last Shell"). So I am ignoring some lines as they are not useful.

I researched the following article: Editing CSV Files (Design Implementation)

This person seems to be doing what I want to do, however they do not show exactly how, or at least the part I need.

I am also attaching a part of the Perl script in case it is better to approach it from that end, but again I have very little experience with Perl (warning Perl script is long).

#!/usr/bin/perl -w

...

#
#initialize
$int_over_sigma_cutoff = 2.95;
$flag_correct_lp = 0;

# get information on file
if ( defined($last_file) ) {
@stat_list = stat( $images_directory . '/' . $images_dir_list[$last_file] );
$date_time = localtime( $stat_list[9] );
$date_time =~ s/^\s+//; # remove white space at beginning of line
@date_parts = split( / +/, $date_time );
$date = $date_parts[2] . '-' . uc( $date_parts[1] ) . '-' . $date_parts[4];
}
}

return $date;
}

sub read_xscale_lp {
#
my $junk;
my $flag_sg_cell = 0;
my $flag_by_shells = 0;
#
# Read XSCALE.LP log file
while ( <> ) {
#
# get space group
if (/ THE DATA COLLECTION STATISTICS REPORTED BELOW ASSUMES:/) {
$flag_sg_cell = 1;
}
#
# get unit cell parameters
elsif ($flag_sg_cell) {

$line = $_;
chomp($line);
$line =~ s/^\s+//; # remove white space at beginning of line

@line_parts = split( / +/, $line );

if (/ SPACE_GROUP_NUMBER=/) {
$space_group = &get_space_group_name( $line_parts[1] );
}
elsif (/ UNIT_CELL_CONSTANTS=/) {
@cell = @line_parts[ 1 .. 7 ];
$flag_sg_cell = 0;
}
}
#
# get rejected measurements and total number of measured reflections
elsif (/REFLECTIONS REJECTED/) {

$line = $_;
chomp($line);
$line =~ s/^\s+//; # remove white space at beginning of line

@line_parts = split( / +/, $line );
$number_rejected = $line_parts[0];
$overall_measured_w_rej = $line_parts[3];
}
#
# test whether column 10 will be Rmrgd-F or CC(1/2)
elsif (/ Rmrgd-F/) {
$column10 = "Rmrgd-F";
}
elsif (m| CC\(1/2\)|) {
$column10 = "CC1/2";
}
#
# Find table and set flag
elsif (/ SUBSET OF INTENSITY DATA WITH SIGNAL\/NOISE >= \-3.0 AS FUNCTION OF RESOLUTION/) {

$flag_by_shells = 1;
$is = 0;

for ( $i = 0; $i <= 2; $i++ ) {
$junk = <>;
}
}
#
# Make sure table flag is turned off
elsif (/ SUBSET OF INTENSITY DATA WITH SIGNAL\/NOISE >= \-2.0 AS FUNCTION OF RESOLUTION/) {
$flag_by_shells = 0;
}
#
# Get information about R-values, I/sigI, etc.
elsif ($flag_by_shells) {

$line = $_;
chomp($line);
$line =~ s/^\s+//; # remove white space at beginning of line
$line =~ s/%//g; # remove percentage sign from numbers

@line_parts = split( / +/, $line );
#
# Get overall values
if ( $line =~ /total/ ) {
$overall_measured = $line_parts[1];
$overall_unique = $line_parts[2];

# $overall_theoretical[is] = $line_parts[3];
$overall_complete = $line_parts[4];
$overall_rvalue = $line_parts[5] / 100;
$overall_int_over_sig = $line_parts[8];

# $overall_r_redun_i[$is] = $line_parts[9];
if ( $column10 eq 'Rmrgd-F' ) {

# $overall_r_redun_f[$is] = $line_parts[10];
}
elsif ( $column10 eq 'CC1/2' ) {
$line_parts[10] =~ s/\*//;
$overall_cc_onehalf = $line_parts[10] / 100.;
}

$flag_by_shells = 0;
}
#
# Store individuals shells in arrays
else {

if ( $is > 0 ) {
$low_res[$is] = $high_res[ $is - 1 ];
}

$high_res[$is] = $line_parts[0];
$measured[$is] = $line_parts[1];
$unique[$is] = $line_parts[2];

# $theoretical[is] = $line_parts[3];
$complete[$is] = $line_parts[4];
$rvalue[$is] = $line_parts[5] / 100;
$int_over_sig[$is] = $line_parts[8];

# $r_redun_i[$is] = $line_parts[9];
if ( $column10 eq 'Rmrgd-F' ) {

# $r_redun_f[$is] = $line_parts[10];
}
elsif ( $column10 eq 'CC1/2' ) {
$line_parts[10] =~ s/\*//;
$cc_onehalf[$is] = $line_parts[10] / 100.;
}

$is++;
}
}
}
#
# Get overall low and high resolution
if ( !defined( $low_res[0] ) ) {
$low_res[0] = 50.;
}

$overall_low_res = $low_res[0];
$overall_high_res = $high_res[$#high_res];
}

sub read_mrfana {
#
# read autoPROC mrfana file and reduce binning to 10
my $line;
my @line_parts;
my $flag_table = 0;
my $ishell = 0;
my $jshell = 0;
#
while (<>) {
if (/binning will be in/) {

$line = $_;
$line =~ s/ +//; # delete spaces at beginning of line
chomp($line);

@line_parts = split( / +/, $line );

$no_of_shells = $line_parts[4];
$no_of_new_shells = $no_of_shells / 10;

if ( $no_of_new_shells != int($no_of_new_shells) ) {
$flag_not_integer = 1;
}
}
elsif ( defined($flag_not_integer) && /binning for run/ ) {

$line = $_;
$line =~ s/ +//; # delete spaces at beginning of line
chomp($line);

@line_parts = split( / +/, $line );

$no_of_shells = $line_parts[7];
$no_of_new_shells = $no_of_shells / 10;

if ( $no_of_new_shells != int($no_of_new_shells) ) {
die "$no_of_new_shells not evenly divisible by 10\n";
}
else {
undef($flag_not_integer);
}
}
elsif (/^ Resolution \#uniq \#R\w\w\w /) {
$line = <>;
$flag_table = 1;
}
elsif ( $flag_table == 1 ) {

if (/^ -------/) {

if ( $ishell != 10 ) {

$high_res[$ishell] = $line_parts[2];

if ( $unique[$ishell] > 0 ) {
$rvalue[$ishell] = $rvalue[$ishell] / $unique[$ishell];
$int_over_sig[$ishell] = $int_over_sig[$ishell] / $unique[$ishell];
$complete[$ishell] = 100. * $complete[$ishell] / $unique[$ishell];
$redundancy[$ishell] = $measured[$ishell] / $unique[$ishell];
}
}
else {
next;
}
}
elsif (/^ Total/) {

$line = $_;
$line =~ s/ +//; # delete spaces at beginning of line
chomp($line);

@line_parts = split( / +/, $line );

$overall_low_res = $line_parts[1];
$overall_high_res = $line_parts[3];
$overall_unique = $line_parts[4];
$overall_measured = $line_parts[5];
$overall_rvalue = $line_parts[6];
$overall_int_over_sig = $line_parts[13];
$overall_complete = 100. * $line_parts[14];
$overall_redundancy = $line_parts[16];

if ( $#line_parts <= 17 ) {
$number_rejected = 0;
}
elsif ( $#line_parts >= 25 ) {
$number_rejected
= $line_parts[18]
+ $line_parts[19]
+ $line_parts[20]
+ $line_parts[21]
+ $line_parts[22]
+ $line_parts[23]
+ $line_parts[24]
+ $line_parts[25];
}
else {
$number_rejected = $line_parts[18];
}
last;
}
else {

$line = $_;
$line =~ s/ +//; # delete spaces at beginning of line
chomp($line);

@line_parts = split( / +/, $line );

if ( $jshell % 10 == 0 ) {
$low_res[$ishell] = $line_parts[0];
$unique[$ishell] = $line_parts[3];
$measured[$ishell] = $line_parts[4];
$rvalue[$ishell] = $line_parts[5] * $line_parts[3];
$int_over_sig[$ishell] = $line_parts[12] * $line_parts[3];
$complete[$ishell] = $line_parts[13] * $line_parts[3];
}
else {
$unique[$ishell] = $unique[$ishell] + $line_parts[3];
$measured[$ishell] = $measured[$ishell] + $line_parts[4];
$rvalue[$ishell] = $rvalue[$ishell] + $line_parts[5] * $line_parts[3];
$int_over_sig[$ishell]
= $int_over_sig[$ishell] + $line_parts[12] * $line_parts[3];
$complete[$ishell] = $complete[$ishell] + $line_parts[13] * $line_parts[3];
}

if ( $jshell % 10 == 9 ) {

$high_res[$ishell] = $line_parts[2];

if ( $unique[$ishell] > 0 ) {
$rvalue[$ishell] = $rvalue[$ishell] / $unique[$ishell];
$int_over_sig[$ishell] = $int_over_sig[$ishell] / $unique[$ishell];
$complete[$ishell] = 100. * $complete[$ishell] / $unique[$ishell];
$redundancy[$ishell] = $measured[$ishell] / $unique[$ishell];
}

$ishell++;
}

$jshell++;
}
}
}

return;
}

sub read_xengen {
#
my $i;
my $junk;
my $flag_sg_cell = 0;
my $flag_redundancy = 0;
my $flag_int_over_sig = 0;
#
while (<>) {
if (/Statistical Treatment of Crystallographic Data for/) {
$flag_sg_cell = 1;
}
#
# read through 2 blank lines and "Crystal Description:"
elsif ($flag_sg_cell) {
#
# get space group
if (/Crystal Spacegroup\:/) {

$line = $_;
chomp($line);
$line =~ s/^\s+//; # remove white space at beginning of line

@line_parts = split( / +/, $line );

$line_parts[5] =~ s/\,//;
$space_group = &get_space_group_name( $line_parts[5] );
}
#
# get unit cell parameters
elsif (/Real-Space Parameters\:/) {

$_ = <>; # parameters are on next line
$line = $_;
chomp($line);
$line =~ s/^\s+//; # remove white space at beginning of line

@line_parts = split( / +/, $line );

@cell = @line_parts[ 0 .. 5 ];
$flag_sg_cell = 0;
}
}
elsif (/Summary of Observation Redundancies by Shells\:/) {

for ( $i = 0; $i <= 2; $i++ ) {
$junk = <>;
}

$is = 0;
$flag_redundancy = 1;
}
elsif ($flag_redundancy) {

$line = $_;
chomp($line);
$line =~ s/^\s+//; # remove white space at beginning of line

@line_parts = split( / +/, $line );
#
# Get overall values
if ( $line =~ /Totals\:/ ) {

# $overall_theoretical = $line_parts[2];
$overall_unique = $line_parts[3];

if ( $line_parts[2] > 0 ) {
$overall_complete = 100 * $line_parts[3] / $line_parts[2];
}
else {
$overall_complete = 0;
}

$overall_measured = $line_parts[13];
$flag_redundancy = 0;
}
else {

if ( $is > 0 ) {
$low_res[$is] = $high_res[ $is - 1 ];
}

$high_res[$is] = $line_parts[0];

# $theoretical[$is] = $line_parts[2];
$unique[$is] = $line_parts[3];

if ( $line_parts[2] > 0 ) {
$complete[$is] = 100 * $line_parts[3] / $line_parts[2];
}
else {
$complete[$is] = 0;
}
$measured[$is] = $line_parts[13];
$is++;
}
}
elsif (/Summary of Reflection Intensities and R-factors by Shells\:/) {

for ( $i = 0; $i <= 3; $i++ ) {
$junk = <>;
}

$is = 0;
$flag_int_over_sig = 1;
}
elsif ($flag_int_over_sig) {

$line = $_;
chomp($line);
$line =~ s/^\s+//; # remove white space at beginning of line

@line_parts = split( / +/, $line );
#
# Get overall values for I/sigI and R-value
if ( $line =~ /Totals\:/ ) {
$overall_int_over_sig = $line_parts[2];
$overall_rvalue = $line_parts[17] / 100;
$flag_int_over_sig = 0;
}
#
# Store individual shells in arrays for I/sigI and R-value
else {
$int_over_sig[$is] = $line_parts[2];
$rvalue[$is] = $line_parts[17] / 100;
$is++;
}
}
}
}

sub check_for_scala_log {
my @dir_pieces = split( /\//, $_[0] );
my $line;
my @line_parts;
my $scala_log;
my $flag_dataset_id = 0;
my $flag_space_group = 0;
my $count = 0;
#
if ( $#dir_pieces > 0 ) {
$scala_log = join( '/', @dir_pieces[ 0 .. $#dir_pieces - 1 ] ) . '/scala.log';
}
else {
$scala_log = 'scala.log';
}
if ( -e $scala_log ) {

open( LOG, "<$scala_log" ) || die "Unable to open $scala_log\n";

while (<LOG>) {

if ( m|Dataset ID, project/crystal/dataset names, cell dimensions, wavelength|
&& $flag_dataset_id == 0 ) {

$flag_dataset_id = 1;

$line = <LOG>; # blank line -- skip
$line = <LOG>; # Dataset ID project name
$line = <LOG>; # crystal name
$line = <LOG>; # dataset name
$line = <LOG>; # cell dimensions
$line =~ s/ +//; # delete spaces at beginning of line
chomp($line);

@cell = split( / +/, $line );

$line = <LOG>; # wavelength
$line =~ s/ +//; # delete spaces at beginning of line
chomp($line);

$wavelength = $line;
$count = $count + 6;
}
elsif ( /Space group =/ && $flag_space_group == 0 ) {
$flag_space_group = 1;
$line = $_;
$space_group = substr(
$line,
index( $line, "\'" ) + 1,
rindex( $line, "\'" ) - ( index( $line, "\'" ) + 1 ) );
$space_group =~ s/ //g;
$count = $count + 2;
}

$count++;
}
}
}

sub check_for_aimless_log {
my @dir_pieces = split( /\//, $_[0] );
my $line;
my @line_parts;
my $aimless_log;
my $flag_average_unit_cell = 0;
my $flag_space_group = 0;
#
if ( $#dir_pieces > 0 ) {
$aimless_log = join( '/', @dir_pieces[ 0 .. $#dir_pieces - 1 ] ) . '/aimless.log';
}
else {
$aimless_log = 'aimless.log';
}

if ( -e $aimless_log ) {

open( LOG, "<$aimless_log" ) || die "Unable to open $aimless_log\n";

while (<LOG>) {

if ( /Average unit cell/ && $flag_average_unit_cell == 0 ) {

$flag_average_unit_cell = 1;

$line = $_;
$line =~ s/ +//; # delete spaces at beginning of line
chomp($line);

@line_parts = split( / +/, $line );

@cell = @line_parts[ 3 .. 8 ];
}
elsif (/Wavelength/) {

$line = $_;
$line =~ s/ +//; # delete spaces at beginning of line
chomp($line);

@line_parts = split( / +/, $line );

$wavelength = $line_parts[1];
}
elsif (/Space group\:/) {

$line = $_;
$line =~ s/ +//; # delete spaces at beginning of line
chomp($line);

$space_group = substr( $line, index( $line, "\:" ) + 1 );
$space_group =~ s/ //g;
}
elsif (/Average mosaicity/) {

$line = $_;
$line =~ s/ +//; # delete spaces at beginning of line

@line_parts = split( / +/, $line );

$average_mosaicity = $line_parts[2];

if ( $mosaicity_min >= 1000. ) {
$mosaicity_min = $average_mosaicity;
}

if ( $mosaicity_max <= -1000. ) {
$mosaicity_max = $average_mosaicity;
}
}
}
}
}

sub read_scl_in {
#
# read scl.in, extract some information, and store scl.in in list
my $i_zone = 0;
my $line;
my @line_parts;
my $flag_est_err = 0;
my $file;
#
my @dir_pieces = split( /\//, $_[0] );

if ( $#dir_pieces > 0 ) {
$file = join( '/', @dir_pieces[ 0 .. $#dir_pieces - 1 ] ) . '/scl.in';
}
else {
$file = 'scl.in';
}
#
# Open old file
# open(INFILE,"<$file") or die "unable to open $file";
open( INFILE, "<$file" ) or return;

while (<INFILE>) {

$line = $_;
chomp($line);

if (/number of zones/) {

$line =~ s/^\s+//; # remove white space at beginning of line

@line_parts = split( / +/, $line );

$n_zone = $line_parts[3];
}
elsif (/estimated error/) {
$flag_est_err = 1;
}
elsif ( /space group/ && !defined($space_group) ) {
$space_group = 1;
}
elsif ( $flag_est_err == 1 ) {

$line =~ s/^\s+//; # remove white space at beginning of line

@line_parts = split( / +/, $line );

if ( $i_zone == 0 ) {
@est_err = @line_parts;
}
else {
push( @est_err, @line_parts );
}

$i_zone = $i_zone + scalar(@line_parts);

if ( $i_zone == $n_zone ) {
last;
}
}
}

close(INFILE);
}

sub read_output_sca {
#
# read output.sca to get cell parameters when reading scale.out for statistics
my $junk;
my $line;
my @line_parts;
my $file;
#
my @dir_pieces = split( /\//, $_[0] );

if ( $#dir_pieces > 0 ) {
$file = join( '/', @dir_pieces[ 0 .. $#dir_pieces - 1 ] ) . '/output.sca';
}
else {
$file = 'output.sca';
}
#
if ( -e $file ) {

open( SCA, "<$file" );
$junk = <SCA>; # Skip first and second lines
$junk = <SCA>;
$line = <SCA>; # Unit cell and space group are third line
close(SCA);

$line =~ s/^\s+//; # remove white space at beginning of line

@line_parts = split( / +/, $line );

@cell = @line_parts[ 0 .. 5 ]; # Unit cell parameters are first 6 values
$space_group = $line_parts[6]; # Space group is 7th value
}
}

sub get_space_group_name {
#
# Use space group dictionary to translate from number to name
$space_group_number = $_[0];

open( SGDIC, "</ap/bms/mmc/space.dic" );

while (<SGDIC>) {

if (/^(P|A|B|C|I|F|R)/) {

$line = $_;
chomp($line);

if ( substr( $line, 21, 3 ) == $space_group_number ) {
@line_parts = split( / +/, $line );
$space_group = $line_parts[0];
last;
}
}
}

return $space_group;
}

Answer

Put the output from your Perl script through this. It will generate the result you require

You can pass the previous output as a parameter on the command line, or you can use a pipe to pass output of the previous script to this one without saving a file at all

use strict;
use warnings 'all';

my @data;

while ( <> ) {

    s/\s*-\s*/-/;

    my @fields = grep /\d/, split;

    pop @fields if /Overall|First Shell/;

    if ( /mosaicity|Overall|First Shell|Last Shell/ ) {
        push @data, \@fields;
    }
    elsif ( /SG & cell/ ) {
        unshift @data, \@fields;
    }
}

print join(' ', map @$_, @data), "\n";

output

p21 64.3 78.1 67.0 90.0 96.8 90.0 0.52-1.59 50.00-2.40 60827 23031 2.6 89.1 0.073 50.00-5.17 7173 2431 3.0 91.9 0.044 2.49-2.40 3105 1722 1.8 66.9 0.343 2.1