#!/usr/local/bin/perl

##############################################################################
# DESCRIPTION:
#
# This script takes a data file and an extra attribute as inputs, and writes an
# output data file in a particular format. It also adds the extra attribute at
# the beginning of each line in the output file.
#
# Additionally, if the number of fields in a line in the input file does not
# match with the number of pre-defined fields, then such lines are filtered
# and are stored in a discard output file.
#
# Version: 0.1     13-Feb-2007  Rajkumar Jain          Initial Version

############################################################################

# The input file format is as follows.
#
#  ATTRIBUTE03|ATTRIBUTE01|ATTRIBUTE05|ATTRIBUTE02|ATTRIBUTE04|
#  01_attr_03|01_attr_01|01_attr_05|01_attr_02|01_attr_04|
#  02_attr_03|02_attr_01|02_attr_05|02_attr_02||
#  03_attr_03|03_attr_01|03_attr_05|03_attr_02|
#  04_attr_03|04_attr_01|04_attr_05|
#  05_attr_03|05_attr_01|05_attr_05|||05_attr_04|
#  06_attr_03|06_attr_01|06_attr_05|06_attr_02|06_attr_04|
#


use strict;
use warnings;

##############################################################################
# USAGE:
#
#   <script> <input_file> <extra_attribute>
#
# INPUT:
#   <input_file>       Pipe-delimited text file. The first line holds the
#                      column headers; every following line is a data record.
#   <extra_attribute>  Value written as the first field of every output line
#                      (including the header line).
#
# OUTPUT:
#   <input_file>.out      Header line followed by every accepted record, with
#                         the columns re-ordered as listed in @col_names.
#                         Columns named in @col_names that are missing from
#                         the input header are written as empty fields.
#   <input_file>.discard  Created only when at least one record is rejected.
#                         Holds a one-line message followed by every record
#                         whose field count differs from the header's.
#
# EXIT STATUS:
#   1 when the argument count is wrong; dies when a file cannot be opened or
#   a written file cannot be closed.
##############################################################################

# Exactly two arguments are required; anything else is a usage error.
if (@ARGV != 2) {
    print STDERR "Incorrect number of arguments to perl \n";
    print STDERR "Usage: $0 <input_file> <extra_attribute>\n";
    exit 1;
}

my ($inp_file, $extra_attr) = @ARGV;

# Column names (headers) in the pre-defined output order.
# This can be changed as needed.
my @col_names = qw(
    ATTRIBUTE01
    ATTRIBUTE02
    ATTRIBUTE03
    ATTRIBUTE04
    ATTRIBUTE05
);

# Fast membership test for the columns we care about.
my %is_wanted = map { $_ => 1 } @col_names;

# Output data file and discard data file names.
my $output_file  = "$inp_file.out";
my $discard_file = "$inp_file.discard";

# Split one record on "|".
# A negative LIMIT keeps trailing empty fields, so the field count does not
# depend on whether the line ended with a newline. split() returns an empty
# list for an empty string, so that case is mapped to a single empty field.
sub split_fields {
    my ($record) = @_;
    return ('') if $record eq '';
    return split /\|/, $record, -1;
}

open(my $input_fh, '<', $inp_file)
    or die "Can't open file for reading ($inp_file): $!";
open(my $output_fh, '>', $output_file)
    or die "Can't open the output file for writing ($output_file): $!";

# Header line of the output file: the extra attribute, then the column names.
# Every field, including the last one, is followed by "|".
print {$output_fh} join('|', $extra_attr, @col_names), "|\n";

my %index_of;        # column name -> zero-based field position in the input
my $header_count;    # number of fields on the input header line
my $discard_fh;      # opened lazily, on the first rejected record

LINE:
while (my $line = <$input_fh>) {

    # Strip the line terminator (LF or CRLF) before counting fields.
    $line =~ s/\r?\n\z//;

    # The first line holds the headers: remember where each wanted column
    # sits in the input. If a header name is repeated, the last one wins.
    if (!defined $header_count) {
        my @header_fields = split_fields($line);
        $header_count = @header_fields;

        for my $position (0 .. $#header_fields) {
            my $name = $header_fields[$position];
            $index_of{$name} = $position if $is_wanted{$name};
        }
        next LINE;
    }

    # Trimmed copy, used only to detect blank lines and for the discard file;
    # the field values themselves are taken from the untrimmed line.
    my $trimmed = $line;
    $trimmed =~ s/\A\s+|\s+\z//g;
    next LINE if $trimmed eq '';

    my @fields = split_fields($line);

    if (@fields == $header_count) {
        # Re-arrange the values into the pre-defined column order; a column
        # that the input header did not provide becomes an empty field.
        my @ordered = map {
            exists $index_of{$_} ? $fields[ $index_of{$_} ] : ''
        } @col_names;

        # The extra attribute is prefixed to every output line.
        print {$output_fh} join('|', $extra_attr, @ordered), "|\n";
    }
    else {
        # Field count mismatch: the record goes to the discard file. The file
        # is created, and its one-line message written, only once.
        if (!defined $discard_fh) {
            open($discard_fh, '>', $discard_file)
                or die "Can't open the discard file for writing ($discard_file): $!";
            print {$discard_fh} "The following lines failed the field count validation \n";
        }
        print {$discard_fh} "$trimmed\n";
    }
}

# Closing file handles. Buffered write errors only surface at close time, so
# the result is checked for the handles that were written to.
close($input_fh);
close($output_fh)
    or die "Can't close the output file ($output_file): $!";
if (defined $discard_fh) {
    close($discard_fh)
        or die "Can't close the discard file ($discard_file): $!";
}