#!/usr/local/bin/perl
##############################################################################
# DESCRIPTION:
#
# This script takes a data file and an extra attribute as inputs, and writes an
# output data file in a particular format. It also adds the extra attribute at
# the beginning of each line in the output file.
#
# Additionally, if the number of fields in a line in the input file does not
# match with the number of pre-defined fields, then such lines are filtered
# and are stored in a discard output file.
#
# Version: 0.1 13-Feb-2007 Rajkumar Jain Initial Version
############################################################################
# The input file format is as follows.
#
# ATTRIBUTE03|ATTRIBUTE01|ATTRIBUTE05|ATTRIBUTE02|ATTRIBUTE04|
# 01_attr_03|01_attr_01|01_attr_05|01_attr_02|01_attr_04|
# 02_attr_03|02_attr_01|02_attr_05|02_attr_02||
# 03_attr_03|03_attr_01|03_attr_05|03_attr_02|
# 04_attr_03|04_attr_01|04_attr_05|
# 05_attr_03|05_attr_01|05_attr_05|||05_attr_04|
# 06_attr_03|06_attr_01|06_attr_05|06_attr_02|06_attr_04|
#
use strict;
use warnings;

# Usage: <script> <input_file> <extra_attribute>
#
# Reads <input_file> (pipe-delimited, first line is the header), and writes:
#   <input_file>.out     - header line followed by every data line whose field
#                          count matches the header, with the columns
#                          re-ordered as listed in @col_names and
#                          <extra_attribute> prefixed as the first field.
#   <input_file>.discard - created only when at least one data line fails the
#                          field count validation; holds those lines.
# Blank (whitespace-only) data lines are skipped silently.

# If the wrong number of arguments is passed to the script, then exit.
if (@ARGV != 2) {
    print "Incorrect number of arguments to perl \n";
    exit 1;
}
my ($inp_file, $extra_attr) = @ARGV;

# Column names (headers) in the pre-defined output order.
# This can be changed as needed.
my @col_names = qw(
    ATTRIBUTE01
    ATTRIBUTE02
    ATTRIBUTE03
    ATTRIBUTE04
    ATTRIBUTE05
);
my %is_known_column = map { $_ => 1 } @col_names;

# Output data file and discard data file names.
my $output_file  = "$inp_file.out";
my $discard_file = "$inp_file.discard";

# Three-argument open keeps characters in the file name (e.g. a leading '>'
# or a trailing '|') from being interpreted as an open mode.
open(my $input_fh, '<', $inp_file)
    or die "Can't open file for reading: $inp_file: $!";
open(my $output_fh, '>', $output_file)
    or die "Can't open the output file for writing: $output_file: $!";

# Header names go in the first line of the output file.
print {$output_fh} join('|', $extra_attr, @col_names), "|\n";

# The first line of the input holds the headers. Map every known column name
# to its field position in the input file. Columns from @col_names that are
# missing from the header get no entry and are emitted as empty fields.
my %index_of;
my $header_count = 0;
my $header_line  = <$input_fh>;
if (defined $header_line) {
    my @header_fields = split_fields($header_line);
    $header_count = @header_fields;
    for my $i (0 .. $#header_fields) {
        my $name = $header_fields[$i];
        $index_of{$name} = $i if $is_known_column{$name};
    }
}

# The discard file is opened lazily, on the first rejected line, so that it
# is only created when there is something to report.
my $discard_fh;

# Loop to iterate through each remaining line of the input file.
while (my $line = <$input_fh>) {
    # Trim all leading and trailing whitespace; the trimmed copy is used for
    # the blank-line check and for the discard report.
    my $trimmed = $line;
    $trimmed =~ s/\A\s+//;
    $trimmed =~ s/\s+\z//;
    next if $trimmed eq "";

    my @fields = split_fields($line);

    if (@fields == $header_count) {
        # Re-arrange the values according to the column positions and prefix
        # the extra attribute.
        my @ordered = map {
            exists $index_of{$_} ? $fields[ $index_of{$_} ] : ""
        } @col_names;
        print {$output_fh} join('|', $extra_attr, @ordered), "|\n";
    }
    else {
        # The banner line is written only once, when the file is created.
        if (!$discard_fh) {
            open($discard_fh, '>', $discard_file)
                or die "Can't open the discard file for writing: $discard_file: $!";
            print {$discard_fh}
                "The following lines failed the field count validation \n";
        }
        print {$discard_fh} "$trimmed\n";
    }
}

# Closing file handles. Buffered write errors only surface at close time, so
# the write handles are checked.
close($output_fh)
    or die "Can't close the output file: $output_file: $!";
if ($discard_fh) {
    close($discard_fh)
        or die "Can't close the discard file: $discard_file: $!";
}
close($input_fh);

# Split one raw input line into its pipe-delimited fields.
# The line terminator (LF or CRLF) is removed first, and a negative LIMIT is
# passed to split so that trailing empty fields are kept. The field count is
# therefore the same whether or not the line ends with a newline, e.g.
# "a|b|" always yields three fields ("a", "b", "").
sub split_fields {
    my ($raw_line) = @_;
    $raw_line =~ s/\r?\n\z//;
    return split /\|/, $raw_line, -1;
}