package shell_var_subs;
###############################################################################
# shell_var_subs provides functions var_find, var_replace, var_insert,
# var_remove and nested_delim_regex. The var_* functions will
# find/replace/insert/remove variable definitions of the form vname=value
# from a shell script.
#
# Variables that may be used or set externally are:
# $Verbose             ...in functions var_find, var_replace, var_remove
# $Quote_all_vval      ...in function var_replace
# $Quote_space_vval    ...in function var_replace
# $Quote_number_vval   ...in function var_replace
#
# Larry Solheim
#
# $Id: shell_var_subs.pm 655 2011-08-04 23:04:36Z acrnrls $
###############################################################################
require Exporter;
@ISA    =qw(Exporter);
@EXPORT    = qw(var_find var_replace var_remove var_insert nested_delim_regex
  find_all_vars rm_shell_comments $Quote_all_vval $Quote_number_vval $Quote_space_vval);
@EXPORT_OK = qw(var_find var_replace var_remove var_insert nested_delim_regex
  find_all_vars rm_shell_comments $Quote_all_vval $Quote_number_vval $Quote_space_vval);

sub find_all_vars {my ($sh_script, $OPTS) = @_;
  use strict;
  my %RET;
  my $verbose = 0;
  if (defined $OPTS->{VERBOSE}) {$verbose = $OPTS->{VERBOSE}};

  # remove comments from the script
  my $tscript = rm_shell_comments($sh_script);
  if ($verbose > 1) {print "find_all_vars: script with comments removed\n$tscript\n"};

  # get a list of all shell variable names in $tscript
  my @vlist = $tscript =~ /\b(\w+)=/g;

  # remove duplicates from the list
  my @V2 = @vlist;
  foreach my $v (@V2) {
    my $found = 0;
    @vlist = grep {$found+=/^$v$/; if (!/^$v$/ or ($found == 1)){1}else{0}} @vlist;
  };
  if ($verbose > 0) {print "find_all_vars: vlist :: @vlist\n"};

  # determine all unique values for each vname in vlist
  foreach my $vname (@vlist) {
    my @vals = var_find($tscript, $vname);
    # Return all values for each variable
    $RET{$vname} = [ @vals ];
    if ($verbose > 0) {print "find_all_vars: values for $vname :: @vals\n"};
  };
  return %RET;
};

sub var_find {my ($sh_script, $vname) = @_;
  use strict;
  # return all unique values of vname=value pairs found in a shell script
  # or false (null value) if vname is not found in $str

  return () unless $sh_script;
  return () unless $vname;

  # Always add a newline to the input shell script
  # The last value may be missed if the input script
  # does not end with a newline
  my $str = "$sh_script"."\n";

  my @VALS = ();
  my $cs;
  my $last_val;
  my $found;
  my $vval;

  # strip comments from the input string
  $str = rm_shell_comments($str);

  # If the variable name does not exist in the string then return an empty list
  unless ($str =~ /\b${vname}=/) {return ()};

  # values are backtic delimited
  my $bt = q@`[^`]*`@;

  # values are $(....) command/arithmetic substitution constructs
  # define a regular expression to match nested parentheses
  my $nested_paren = nested_delim_regex('(',')',5);
  $cs = '\$'."$nested_paren";

  # values are single quote delimited
  my $sq = q@'[^']*'@;

  # values are double quote delimited
  my $dq = q@"[^"]*"@;

  # values are terminated by a colon or white space
  my $sw = q@[^\s;]*?(?=[\s;])@;

  # null values
  my $nv = '[\s;]';

  push @VALS, $str =~ /\b${vname}=($bt|$cs|$sq|$dq|$sw|$nv)/gs;

  # Clean up values to be returned
  foreach (@VALS) {
    s/^$nv$//s;     # remove white space and/or semicolon from null values
    s/\s+$//;       # strip trailing white space
    s/^"(.*)"$/$1/; # strip double quotes
    s/^'(.*)'$/$1/; # strip single quotes
  };

  # When there is at least 1 element assigned in VALS remove all instances
  # of the last value assigned in the list. This last value will be
  # pushed onto VALS after duplicates are removed from the list ensuring
  # that it remains the last value.
  if (scalar(@VALS)) {
    $last_val = $VALS[$#VALS];
    @VALS = grep !/^$last_val$/, @VALS
  };

  # remove duplicate values from the list
  my @V2 = @VALS;
  foreach $vval (@V2) {
    $found = 0;
    @VALS = grep {$found+=/^$vval$/; if (!/^$vval$/ or ($found == 1)){1}else{0}} @VALS;
  };
  if (defined $last_val) {push @VALS, $last_val};
  if ($main::Verbose > 10) {foreach (@VALS) {print "var_find: $vname=$_\n"}};

  return @VALS;
}

sub var_replace { my ($str, $vname, $vval) = @_;
  use strict;
  # replace a vname=value variable definition in a shell script
  # The (possibly) modified string is returned together with a flag to
  # indicate success or failure of the replace operation.
  # Various forms of vname=value are checked for as indicated below.

  # if either $str or $vname are untrue (undefined, null or zero)
  # or $vval is undefined (null or zero values are allowed)
  # then simply return the original string with a false return value
  return ($str,0) unless ($str and $vname and defined $vval);

  # Note: if vval is defined but null then the shell script variable will
  # get a null value which may or may not be what is desired. To avoid null
  # values in the shell script check that vval is not null prior to the call.

  # Setting the global variable $Quote_all_vval true will cause all
  # replaced variable values to be quoted
  my $quote_all_vval = 0;
  $quote_all_vval = $main::Quote_all_vval if defined $main::Quote_all_vval;

  # spaces should always be quoted, however this may be overridded by
  # setting the global variable $Quote_space_vval
  my $quote_space_vval = 1;
  $quote_space_vval = $main::Quote_space_vval if defined $main::Quote_space_vval;

  # It is not nessecary to quote numeric values but may be desireable
  # in certain circumstances.
  # To enable/disable set the global variable $Quote_number_vval true/false
  my $quote_number_vval = 0;
  $quote_number_vval = $main::Quote_number_vval if defined $main::Quote_number_vval;

  # Vtrip  is the lowest value of Verbose that will
  # produce debugging output from this subroutine
  my $Vtrip=3;

  my $vv;
  my $found;
  my $n;

  # Always quote values of certain variables
  # Any variable name that ends in "_prefix_list" gets single quoted
  if ( $vname =~ /^\s*\w+_prefix_list\s*$/ ) {
    # If the value is already quoted, do nothing
    unless ($vval =~ /^".*"$/ or $vval =~ /^'.*'$/) {
      if ($vval !~ /'/) {$vval = "'" . "$vval" . "'"}
      elsif ($vval !~ /"/) {$vval = '"' . "$vval" . '"'}
      else {print "var_replace: Unquoted value in $vname=$vval\n"};
    }
  }

  # Special treatment for any variable that ends in "_file_list_opts"
  if ( $vname =~ /^\s*\w+_file_list_pattern\s*$/ ) {
    # If the value is already quoted, do nothing
    unless ($vval =~ /^".*"$/ or $vval =~ /^'.*'$/) {
      if ($vval !~ /'/) {$vval = "'" . "$vval" . "'"}
      elsif ($vval !~ /"/) {$vval = '"' . "$vval" . '"'}
      else {print "var_replace: Unquoted value in $vname=$vval\n"};
    }
  }

  # quote values that contain shell meta characters
  my $quote_shmeta_vval = 1;
  if ($quote_shmeta_vval and $vval =~ /[\Q][*)(?!\E]/) {
    # If the value is already quoted, do nothing
    unless ($vval =~ /^".*"$/ or $vval =~ /^'.*'$/) {
      if ($vval !~ /'/) {$vval = "'" . "$vval" . "'"}
      elsif ($vval !~ /"/) {$vval = '"' . "$vval" . '"'}
      else {print "var_replace: Unquoted shell meta chars in $vname=$vval\n"};
    }
  }

  # ensure that white space in the value is quoted if requested
  if ($quote_space_vval and $vval =~ /\s/) {
    # If the value is already quoted, do nothing
    unless ($vval =~ /^".*"$/ or $vval =~ /^'.*'$/) {
      if ($vval !~ /"/) {$vval = '"' . "$vval" . '"'}
      elsif ($vval !~ /'/) {$vval = "'" . "$vval" . "'"}
      else {print "var_replace: Unquoted  white space in $vname=$vval\n"}
    }
  }

  # quote values that are integer or floating point numbers, if requested
  if ($quote_number_vval and $vval =~ /^ *[+-]?\d+\.?\d*([EeDd][+-]?\d+)* *$/) {
    # If the value is already quoted, do nothing
    unless ($vval =~ /^".*"$/ or $vval =~ /^'.*'$/) {
      if ($vval !~ /"/) {$vval = '"' . "$vval" . '"'}
      elsif ($vval !~ /'/) {$vval = "'" . "$vval" . "'"}
      else {print "var_replace: Unquoted number in $vname=$vval\n"};
    }
  }

  # quote all values if requested
  if ($quote_all_vval) {
    # If the value is already quoted, do nothing
    unless ($vval =~ /^".*"$/ or $vval =~ /^'.*'$/) {
      if ($vval !~ /"/) {$vval = '"' . "$vval" . '"'}
      elsif ($vval !~ /'/) {$vval = "'" . "$vval" . "'"}
      else {print "var_replace: Unquoted value in $vname=$vval\n"};
    }
  }

  # Each of the following substitutions must be done in the order shown below
  # so that multiple instances of a variables with different values will
  # be dealt with appropriately

  # values are backtic delimited
  my $bt = q@`[^`]*`@;

  # values are $(....) command/arithmetic substitution constructs
  # define a regular expression to match nested parentheses
  my $nested_paren = nested_delim_regex('(',')',5);
  my $cs = '\$'."$nested_paren";

  # values are single quote delimited
  my $sq = q@'[^']*'@;

  # values are double quote delimited
  my $dq = q@"[^"]*"@;

  # values are enclosed in parentheses (...) to a depth of 2
  # my $ep = nested_delim_regex('(',')',2);

  # values are enclosed in braces {...} to a depth of 2
  # my $eb = nested_delim_regex('{','}',2);

  # values are enclosed in square brackets [...] to a depth of 2
  # my $es = nested_delim_regex('[',']',2);

  # values are terminated by a colon or white space
  my $sw = q@[^\s;]*?(?=[\s;])@;

  # put it all together
  # my $vvregex = "$bt|$cs|$sq|$dq|$ep|$eb|$es|$sw";
  my $vvregex = "$bt|$cs|$sq|$dq|$sw";

  # Replace all values with any of the above forms
  my @ovals = ();
  if ($main::Verbose > $Vtrip) {
    @ovals = $str =~ /[^-]\b${vname}=($vvregex)/gs;
    # remove duplicate values from the list
    my @V2 = @ovals;
    foreach $vv (@V2) {
      $found = 0;
      @ovals = grep {$found+=/^\Q$vv\E$/;if(!/^\Q$vv\E$/ or ($found==1)){1}else{0}} @ovals;
    }
  }
  my $rval = ($str =~ s/([^-])\b${vname}=($vvregex)/$1${vname}=${vval}/gs);

  if ($main::Verbose > $Vtrip) {
    if ($rval) {
      printf "The assignment %10s=%s\n",$vname,$vval;
      $n = 0;
      foreach (@ovals) {
        $n++;
        if ($_ or $_ eq '') {
          printf "  %8s     %10s=%s\n",($n==1)?"replaces":"and",$vname,$_
        }
      }
    }
  }

  if ($rval and $main::Verbose > 1) {print "RESET ${vname}=${vval}\n"};
  if (not $rval and $main::Verbose > 1) {print "${vname}=${vval} NOT found\n"};
  return ($str, $rval);
}

sub var_insert { my ($str, $instr, $OPTS) = @_;
  use strict;
  # Insert a string $instr (could be a vname=value variable assignment)
  # into the shell script found in $str
  # The (possibly) modified string in returned together with a flag to
  # indicate success or failure of the remove operation.

  my $Vtrip = 0;
  my $verbose = 0;
  if (defined $OPTS->{VERBOSE}) {$verbose = $OPTS->{VERBOSE}};

  # If the insert string is of the form vname=value and vname is already a
  # variable name in the input string then return the unmodified input string
  # with a return value of 0 unless asked to force the insert via $force
  my $force = 0;
  if (defined $OPTS->{FORCE}) {$force = $OPTS->{FORCE}};
  unless ($force) {
    my $vname='';
    my $vval='';
    ($vname,$vval) = split '=', $instr,2;
    unless ($vname eq $instr) {
      # The input string is of the form vname=value
      if ($str =~ /\b${vname}=/) {
        warn "var_insert: $vname already exists in the input string.
          $instr is NOT inserted\n";
        return ($str,0);
      };
    };
  };

  # Initialize the return value
  my $rval = 0;

  # First check for a tag of the form <<INSERT>> or <<INSERT n>>
  my $locF = -1;
  my $insert_at_tag = 0;
  if (defined $OPTS->{TAG}) {$insert_at_tag = 1*$OPTS->{TAG}};
  if (not defined $insert_at_tag or $insert_at_tag == 0 or $insert_at_tag == -1) {
   if ($str =~ s/^(.*<<\s*INSERT\s*0*\s*>>.*)$/$1\n  $instr/gm) {$rval = $locF};
    if ($verbose > $Vtrip and $rval) {
      print "var_insert: ...ADDED $instr after every <<INSERT>> tag\n"
    };
  } else {
    my @TAG_in_string = sort ($str =~ /^.*(<<\s*INSERT\s*[0-9]*\s*>>).*$/gm);
    foreach my $tag (@TAG_in_string) {
      my ($tag_no) = ($tag =~ /<<\s*INSERT\s*([0-9]+)*\s*>>/);
      $tag_no = 0 unless $tag_no;
      if ($insert_at_tag == $tag_no) {
        if ($str =~ s/^(.*$tag.*)$/$1\n  $instr/gm) {$rval = $tag_no};
        if ($verbose > $Vtrip and $rval) {
          print "var_insert: ...ADDED $instr after every <<INSERT $tag_no >> tag\n"
        };
      };
    };
  };

  my $locA = -2;
  if ($insert_at_tag == $locA or ($insert_at_tag == 0 and not $rval)) {
    # Insert after every line of the form /#.*Parmsub Parameters/
    if ($str =~ s/^(#.*Parmsub Parameters.*)$/$1\n  $instr/gm) {$rval = $locA};
    if ($verbose > $Vtrip and $rval) {
      print "var_insert: ...ADDED $instr after every \/#\.\*Parmsub Parameters\/ line\n"
    };
  };

  my $locB = -3;
  if ($insert_at_tag == $locB or ($insert_at_tag == 0 and not $rval)) {
    # Insert at the top of every parmsub section
    if ($str =~ s/^( *### *parmsub.*)$/$1\n  $instr/gm) {$rval = $locB};
    if ($verbose > $Vtrip and $rval) {
      print "var_insert: ...ADDED $instr to every parmsub section\n"
    };
  };

  my $locC = -4;
  if ($insert_at_tag == $locC or ($insert_at_tag == 0 and not $rval)) {
    # Insert in the Execute_Script doc
    my $rex1 = q@(?:^|\n)( *cat *> *Execute_Script *<<.*?(\n *set *-a[^\n]*\n))@;
    my $rex2 = q@^( *cat *> *Execute_Script *<<.*)$@;
    if ($str =~ s/$rex1/$1\n  # User supplied via cccjob command line\n  $instr\n/s) {
      # Insert after the first set -a found in the Execute_Script doc
      $rval = $locC;
    } elsif ($str =~ s/$rex2/$1\n  $instr/m) {
      # Insert at top of the Execute_Script doc
      $rval = $locC;
    }
    if ($verbose > $Vtrip and $rval) {
      print "var_insert: ...ADDED $instr to the Execute_Script doc\n"
    }
  }

  my $locD = -5;
  if ($insert_at_tag == $locD or ($insert_at_tag == 0 and not $rval)) {
    # Insert at top of the Model_Input doc
    if ($str =~ s/^( *cat *> *Model_Input *<<.*)$/$1\n  $instr/m) {$rval = $locD};
    if ($verbose > $Vtrip and $rval) {
      print "var_insert: ...ADDED $instr to the Model_Input doc\n"
    };
  }

  my $locE = -6;
  if ($insert_at_tag == $locE or ($insert_at_tag == 0 and not $rval)) {
    # If the job has "set -a" on the first executable line or "set -a" as the first
    # executable line and ". betapath2" as the second executable line then add the
    # vname=value after that(those) line(s).
#    if ($str =~ /^( *#.*\n|^ *\n)* *set -a.*\n( *#.*\n|^ *\n)* *\. *betapath2(\s*|\s*#.*)\n/s) {
    if ($str =~ /^(\s*#.*|^\s*)\n*\s*set\s+-a.*\n( *#.*\n|^ *\n)* *\. *betapath2(\s*|\s*#.*)\n/s) {
      # Insert after the ". betapath2" line
      if ($str =~ s/^(\s*\.\s*betapath2(\s*|\s*#.*))$/$1\n  $instr/m) {$rval = $locE};
      if ($verbose > $Vtrip and $rval) {
        print "var_insert: ...ADDED $instr after . betapath2 line\n"
      };
    }
    elsif ($str =~ /^( *#.*\n|^ *\n)* *set -a.*\n/s) {
      # Insert after the "set -a" line
      if ($str =~ s/^( *set -a.*)$/$1\n  $instr/m) {$rval = $locE};
      if ($verbose > $Vtrip and $rval) {
        print "var_insert: ...ADDED $instr after set -a line\n"
      };
    }
    elsif ($str =~ s/^(((\s*#.*|\s*)\n)*)/$1\n  $instr\n/m) {
      # Otherwise insert as the first executable line
      $rval = $locE;
      if ($verbose > $Vtrip and $rval) {
        print "var_insert: ...ADDED $instr at top of script\n"
      };
    }
    else {
      # This should never happen, but just in case...
      die "var_insert: ...failed to insert $instr. Stopped";
    };
  };

  return ($str, $rval);
}

sub var_remove { my ($sh_script, $vname, $OPTS) = @_;
  use strict;
  # Remove all variable definitions $vname=... from a shell script.
  # The (possibly) modified string in returned together with a flag to
  # indicate success or failure of the remove operation.
  # Various forms of vname=value are checked for as indicated below.
  return ($sh_script, 0) unless (defined $sh_script and defined $vname);

  my $verbose = 0;
  if (defined $OPTS->{VERBOSE}) {$verbose = $OPTS->{VERBOSE}};

  my $strip_comments = 0;
  if (defined $OPTS->{STRIP_COMMENTS}) {$strip_comments = $OPTS->{STRIP_COMMENTS}};

  # Always add a newline to the input shell script
  # The last value may be missed if the input script
  # does not end with a newline
  my $str = "$sh_script"."\n";

  # strip comments from the input string when requested
  if ($strip_comments) {$str = rm_shell_comments($str)};

  # values are backtic delimited
  my $bt = q@`[^`]*`@;

  # values are $(....) command/arithmetic substitution constructs
  # define a regular expression to match nested parentheses
  my $nested_paren = nested_delim_regex('(',')',5);
  my $cs = '\$'."$nested_paren";

  # values are single quote delimited
  my $sq = q@'[^']*'@;

  # values are double quote delimited
  my $dq = q@"[^"]*"@;

  # values are terminated by a colon or white space
  my $sw = q@[^\s;]*?(?=[\s;])@;

  # Remove all values with any of the above forms
  my $rval = ($str =~ s/\b${vname}=($bt|$cs|$sq|$dq|$sw) *;?//gs);

  if ($rval and $verbose > 1) {print "REMOVED ${vname}\n"};
  return ($str, $rval);
}

sub rm_shell_comments {my ($str, $OPTS) = @_;
  use strict;
  # Remove all "#" delimited comments and blank lines from a shell script

  my $verbose = 0;
  if (defined $OPTS->{VERBOSE}) {$verbose = $OPTS->{VERBOSE}};

  # regex to find anything that is not a hash mark
  my $nh = q@[^#]*@;

  # regex to find valid script followed by backtic delimited strings
  my $bt = $nh . q@`[^`]*`@;

  # regex to find strings within parentheses to a nested depth of 3
  my $paren = '\(([^)(]|\(([^)(]|\(([^)(]|\([^)(]*\))*\))*\))*\)';
#  my $paren = '\([^)(]*\)';

  # regex to find values that are $(....) command/arithmetic substitution
  # constructs and are preceeded by valid script
  my $cs = $nh . '\$' . "$paren";

  # regex to find valid script followed by single quote delimited strings
  my $sq = $nh . q@'[^']*'@;

  # regex to find valid script followed by double quote delimited strings
  my $dq = $nh . q@"[^"]*"@;

  # remove all comments from the string
  # hash marks within single or double quotes as well as
  # within backtic or $(....) delimited text are ignored
#  $str =~ s/(($cs|$bt|$sq|$dq|$nh)*)[^\n]*\n/$1\n/gs; # quotes extend across multiple lines
#  $str =~ s/(($cs|$bt|$sq|$dq|$nh)*).*/$1/gm; # quotes are limited to 1 line
  $str =~ s/(($cs|$bt|$sq|$dq)*$nh).*/$1/gm; # quotes are limited to 1 line

  # remove all blank lines
  $str =~ s/\s*\n/\n/sg;

  if ($verbose > 0) {print "String returned from rm_shell_comments:\n$str\n"};

  return ($str);
};

sub nested_delim_regex { my ($left_delim, $right_delim, $depth, $OPTS) = @_;
  use strict;
  # This subroutine will return a regular expression that will match whatever is
  # between a set of left and right delimiters, including the enclosing delimiters.
  # The delimiters may be nested to "depth" but be warned that setting depth to a
  # large number will result in a very large regular expression.
  # The default delimiters are parenthese "(" and ")" and the default depth is 1.
  # Currently, only single character delimiters are allowed.
  #
  # Note: when left and right delimiters are the same character nesting is not
  # possible. In this case simply use a regex like $delim[^$delim]*$delim to match
  # a string enclosed between pairs of the single character $delim.
  #
  # Here are a few examples of regular expressions generated for nested paretheses
  # parentheses depth=0 :: \([^)(]*\)
  # parentheses depth=1 :: \(([^)(]|\([^)(]*\))*\)
  # parentheses depth=2 :: \(([^)(]|\(([^)(]|\([^)(]*\))*\))*\)
  # parentheses depth=3 :: \(([^)(]|\(([^)(]|\(([^)(]|\([^)(]*\))*\))*\))*\)

  # Set optional parameters
  my $verbose = 0;
  if (defined $OPTS->{VERBOSE}) {$verbose = $OPTS->{VERBOSE}};

  my $BR_all_nested_delim = 0;
  if (defined $OPTS->{BACKREF}) {$BR_all_nested_delim = $OPTS->{BACKREF}};

  $left_delim='('  unless $left_delim;
  $right_delim=')' unless $right_delim;
  $depth=1         unless defined $depth; # defined $depth will allow depth=0
  if ($depth < 0) {
    print "nested_delim_regex: depth=$depth is out of range\n";
    return '';
  };
  my $nl = length $left_delim;
  my @L  = unpack 'A' x $nl, $left_delim;
  my $nr = length $right_delim;
  my @R  = unpack 'A' x $nr, $right_delim;
  if ($nl > 1) {
    print "nested_delim_regex: A delimiter longer than 1 char was found. ";
    print "$left_delim truncated to $L[0]\n";
    $left_delim = $L[0];
  };
  if ($nr > 1) {
    print "nested_delim_regex: A delimiter longer than 1 char was found. ";
    print "$right_delim truncated to $R[0]\n";
    $right_delim = $R[0];
  };
  # Normally there will be no back reference except for the outer loop
  # to have the regex back reference each nested match set $nobr null
  # by setting the global variable $BR_all_nested_delim true
  my $nobr='?:';
  $nobr='' if $BR_all_nested_delim;
  # eg: $nested_paren = '\(' . '([^()]|\(' x $depth . '[^()]*' . '\))*' x $depth . '\)';
  my $p1 = "\\$left_delim";
  my $p2 = '('. $nobr .'[^' . "$right_delim$left_delim" . ']|' . "\\$left_delim"; # x $depth
  my $p3 = '[^' . "$right_delim$left_delim" . ']*';
  my $p4 = "\\$right_delim" . ')*'; # x $depth
  my $p5 = "\\$right_delim";
  my $regex = $p1 . $p2 x $depth . $p3 . $p4 x $depth . $p5;
  return $regex;
};
