Thursday, May 17, 2012

Primitive XML Parser in Perl

Sometimes you need to parse a very simple XML file (config for your own project). In cases like that you may like to use something quick an dirty, just to get the content. Below is a VERY PRIMITIVE XML parser.
Your config might look like the below
<unit name="sometest">

<package>
TestHarness
</package>

<targets>
 <hosts location="DC1">
   myhostdc11.mynetwork.net, myhostdc12.mynetwork.net
 </hosts>
 <hosts location="DC2">
   myhostdc21.mynetwork.net, myhostdc22.mynetwork.net
  </hosts>
</targets>

<svn>
  http://subversion.ny.mynetwork.net/svn/repos/MYPROJECT
</svn>

<deployment>
TAG_NUMBER=$1

tar -zxvf TestHarness_${TAG_NUMBER}.tar.gz
ln -sfn TestHarness_${TAG_NUMBER} testharness

cd ./testharness
chmod  -R 777 *

cd ./scripts
dos2unix *
chmod 777 *

cd ../data
dos2unix *
chmod 777 *

</deployment>
</unit>

The Perl script to parse this will look like below (NOTE I had to change formatting a little to fit this page).

#!/usr/bin/perl

{

 $num_args = $#ARGV + 1;
 if ($num_args != 4) {
   print "\nUsage: " . 
  "configurator.pl configfile_name project_name " . 
  "datacenter [script|hosts|vars]\n";
  exit;
 }

 $conf_file=$ARGV[0];
 $project_name=$ARGV[1];
 $datacenter=$ARGV[2];
 $sectionout=$ARGV[3];

 my @project_config;
 my $onestring = "";

 open (CONFFILE, $conf_file) or die("Unable to open file");
 $inproject=0;
 while (<CONFFILE>) {
  chomp;
    push(@file_data,$_);
 }

 my @project_config = &getcontentforattr(\@file_data, "unit",
       "name", $project_name );


 my (@content);
 
 if($sectionout eq "script"){
     my @content = &getcontent(\@project_config, "deployment");
     foreach $temp (@content){
       print $temp . "\n";
     }
 }

 if($sectionout eq "vars"){
    my $svnlocation =&getsvnlocation(\@project_config);
    print "SVNLOCATION=" . ltrim($svnlocation) . "\n";
 
    my $packagename = &getpackagename(\@project_config);
    print "PACKAGENAME=" . $packagename . "\n";
 }
 
 if($sectionout eq "hosts"){
    my @projhosts = &gethosts(\@project_config, $datacenter);
    foreach $host (@projhosts){
       print $host . "\n";
    }
 }
 
 close (CONFFILE); 
}
####################
sub getdeploymentscript
{
  my(@proj_conf) = @{$_[0]};
 
  my @output = &getcontent(\@proj_conf, "deployment");
  
  return (@output);
}
#######################
sub getsvnlocation
{
  my(@proj_conf) = @{$_[0]};
  
  my $strout = "";
  my @output = &gettrimcontent(\@proj_conf, "svn");
  foreach  $temp (@output){
       $strout = $strout . $temp;
  }
  
  return ($strout);
}
##########################
sub getpackagename
{
  my(@proj_conf) = @{$_[0]};

  my $strout = "";
  my @output = &gettrimcontent(\@proj_conf, "package");
  foreach  $temp (@output){
       $strout = $strout . $temp;
  }

  return ($strout);
}
#############################################
sub gethosts
{
  my(@proj_conf) = @{$_[0]};
  my($dc) = $_[1];

  my @section_targets = &getcontent(\@proj_conf, "targets");
  
  my @section_hosts = &getcontentforattr(\@section_targets, 
                                          "hosts",
                                          "location", $dc );
  
  my (@hostsout);
  foreach $hostlongline (@section_hosts){
     my @hoststmp = split(/,/,$hostlongline);
     foreach $host ( @hoststmp){
         if(length($host) > 0){
            push(@hostsout,trim($host));
         }
     }  
  }

  return (@hostsout);
}
#############################################
# parser functions
#############################################
sub gettrimcontent
{
  my (@section_in) = @{$_[0]};
  my $tag = $_[1];
  
  my (@content_out);
  my @content = &getcontent(\@section_in, $tag);
  foreach $temp (@content){
     if( length($temp) > 0 ){
        push(@content_out, trim($temp));
     }
  }

  return (@content_out);
}
############################################
sub getcontent
{
  my (@section_in) = @{$_[0]};
  my $tag = $_[1];
  
  my (@content);
  
  my $insection = 0;
  my $tagline = 0;
  foreach $sectionline (@section_in){

    $tagline = 0;
    if ( $insection == 0 && $sectionline =~ /<$tag *.*>/  ){
       $insection=1;
       $tagline = 1;     
       $sectionline = substr($sectionline,($+[0])+1);
    }
    
    if ( $insection == 1 && $sectionline =~ /<\/$tag *>/ ){
       $insection=0;
       $tagline = 1;
       $sectionline=substr($sectionline,0,($-[0]));
       if(length(trim($sectionline))>0 ) {
         push(@content, $sectionline);
       }
    }
    
    if( $insection == 1 ){
        if ($tagline == 1 && length(trim($sectionline)) > 0){
           push(@content,$sectionline);
        }
        elsif ($tagline == 0){
           push(@content,$sectionline);
        }
    }
    
  }
  return (@content);
}
#############################################
sub getcontentforattr
{
  my (@section_in) = @{$_[0]};
  my $tag = $_[1];
  my $attr = $_[2];
  my $attrvalue = $_[3];
  
  my (@content);
  
  my $insection = 0;
  my $tagline = 0;
  foreach $sectionline (@section_in){

    $tagline = 0;
    if ( 
        $insection == 0 && 
        $sectionline =~ /<$tag .*$attr=\"$attrvalue\".*>/
    ){

       $insection=1;
       $tagline = 1;     
       $sectionline = substr($sectionline,($+[0])+1);
    }
    
    if ( $insection == 1 && $sectionline =~ /<\/$tag *>/ ){
       $insection=0;
       $tagline = 1;
       $sectionline=substr($sectionline,0,($-[0]));
       if(length(trim($sectionline))>0 ) {
         push(@content, $sectionline);
       }
    }
    
    if( $insection == 1 ){
        if ($tagline == 1 && length(trim($sectionline)) > 0){
           push(@content,$sectionline);
        }
        elsif ($tagline == 0){
           push(@content,$sectionline);
        }
    }
    
  }
  return (@content);
  
  
}
#############################################
##### Service functions
#############################################
sub trim($)
{
 my $string = shift;
 $string =~ s/^\s+//;
 $string =~ s/\s+$//;
 return $string;
}
# Left trim function to remove leading whitespace
sub ltrim($)
{
 my $string = shift;
 $string =~ s/^\s+//;
 return $string;
}
# Right trim function to remove trailing whitespace
sub rtrim($)
{
 my $string = shift;
 $string =~ s/\s+$//;
 return $string;
}