#!/usr/local/bin/perl
##############################################################################
# DESCRIPTION:
#
# This script takes a data file and an extra attribute as inputs, and writes
# an output data file whose columns follow a pre-defined order. The extra
# attribute is added at the beginning of each line of the output file.
#
# Additionally, if the number of fields in a line of the input file does not
# match the number of fields in the header (first) line, the line is filtered
# out and stored in a discard output file.
#
# USAGE:
#   <script> <input_file> <extra_attribute>
#
# OUTPUT:
#   <input_file>.out      header line plus the re-ordered data lines
#   <input_file>.discard  rejected lines (created only if a line is rejected)
#
# Version: 0.1 13-Feb-2007 Rajkumar Jain Initial Version
############################################################################
# Input file format is as follows.
#
# ATTRIBUTE03|ATTRIBUTE01|ATTRIBUTE05|ATTRIBUTE02|ATTRIBUTE04|
# 01_attr_03|01_attr_01|01_attr_05|01_attr_02|01_attr_04|
# 02_attr_03|02_attr_01|02_attr_05|02_attr_02||
# 03_attr_03|03_attr_01|03_attr_05|03_attr_02|
# 04_attr_03|04_attr_01|04_attr_05|
# 05_attr_03|05_attr_01|05_attr_05|||05_attr_04|
# 06_attr_03|06_attr_01|06_attr_05|06_attr_02|06_attr_04|
#

use strict;
use warnings;

# If the wrong number of arguments is passed to the script, then exit.
if (@ARGV != 2) {
    print STDERR "Incorrect number of arguments to perl \n";
    exit 1;
}

my ($inp_file, $extra_attr) = @ARGV;

# Array containing column names (headers) in the pre-defined output order.
# This can be changed as needed.
my @col_names = (
    "ATTRIBUTE01",
    "ATTRIBUTE02",
    "ATTRIBUTE03",
    "ATTRIBUTE04",
    "ATTRIBUTE05",
);

# Output data file and discard data file names are derived from the input name.
my $output_file  = $inp_file . ".out";
my $discard_file = $inp_file . ".discard";

# Three-argument open with lexical handles: the file name can never be
# misread as an open mode, and the handles are scoped to this script.
open(my $input_fh, '<', $inp_file)
    or die "Can't open file for reading ($inp_file): $!";
open(my $output_fh, '>', $output_file)
    or die "Can't open the output file for writing ($output_file): $!";

# Print the header names in the first line of the output file. Every field,
# including the last one, is followed by a '|'.
print {$output_fh} join('|', $extra_attr, @col_names), "|\n";

my %index_of;        # header name => position of that column in the input
my $header_count;    # number of fields in the header; undef until it is read
my $discard_fh;      # opened lazily, only when the first line is rejected

# Loop to iterate through each line of the input file.
LINE:
while (my $line = <$input_fh>) {

    # Strip the line terminator (LF or CRLF) so that it never ends up inside
    # a field or in the discard file.
    $line =~ s/\r?\n\z//;

    # The first line contains the headers. Record where each header sits in
    # the input so data lines can be re-ordered as defined in @col_names.
    if (!defined $header_count) {
        # Limit -1 keeps trailing empty fields, so "A|B|" counts as 3 fields.
        my @header_fields = split /\|/, $line, -1;
        $header_count = @header_fields;

        # If a header name is repeated, the last occurrence wins.
        @index_of{@header_fields} = (0 .. $#header_fields);
        next LINE;
    }

    # Trim a copy of the line; it is used for the blank-line test and for the
    # discard file. Fields are split from the untrimmed line so that their
    # content is passed through unchanged.
    (my $trimmed_line = $line) =~ s/\A\s+|\s+\z//g;
    next LINE if $trimmed_line eq "";

    my @line_items = split /\|/, $line, -1;

    if (@line_items == $header_count) {
        # The number of values matches the number of headers: re-arrange the
        # values according to @col_names. Columns that are absent from the
        # input header are written as empty fields.
        my @ordered_items =
            map { exists $index_of{$_} ? $line_items[ $index_of{$_} ] : "" }
            @col_names;

        # $extra_attr is prefixed to every line written to the output file.
        print {$output_fh} join('|', $extra_attr, @ordered_items), "|\n";
    }
    else {
        # The number of values does not match the number of headers: store
        # the line in the discard file. The explanatory message is written
        # only once, when the discard file is first created.
        if (!defined $discard_fh) {
            open($discard_fh, '>', $discard_file)
                or die "Can't open the discard file for writing ($discard_file): $!";
            print {$discard_fh}
                "The following lines failed the field count validation \n";
        }
        print {$discard_fh} $trimmed_line, "\n";
    }
}

# Close the file handles. Buffered write errors only surface at close time,
# so the write handles are checked.
close($input_fh);
close($output_fh)
    or die "Can't close the output file ($output_file): $!";
if (defined $discard_fh) {
    close($discard_fh)
        or die "Can't close the discard file ($discard_file): $!";
}