#!/usr/bin/perl

# ***** BEGIN LICENSE BLOCK *****
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Yerna Lindale.
#
# The Initial Developer of the Original Code is
# the Software Engineering Lab, INTEC, University Ghent.
# Portions created by the Initial Developer are Copyright (C) 2004
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Kris De Schutter <kris.deschutter@ugent.be>
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ***** END LICENSE BLOCK *****

# This script preprocesses a (normalized) Cobol source program.
#
# We inline all COPY's present in a (normalized) Cobol source program.
#
# This tool is part of "Yerna Lindale" (old music).
#
# @author Kris De Schutter <kris.deschutter@ugent.be>
# @date Wed. August 20, 2003

# Pitfalls / Problems:
#
# Handling replacements is a big pain, and will most likely have some
# bugs. I especially think the following case is treated wrongly:
#   "Each of the separators comma (, ), semicolon (; ) or space in
#   pseudo-text-1 or in the library text is treated as a single space.
#   Each string of one or more spaces is treated as a single space."
#
# COPY statements which use semi-quoted strings containing a dot.
# Semi-quotes are not (yet) considered when checking whether a dot is
# inside a literal.
#
# Finding the end of an unbalanced literal does not (yet) take into
# account quotes which have been escaped.

use Getopt::Long;

# Options...
$help = 0;
$version = 0;
$quiet = 0;
$file_in = '';
$file_out = '';
# @includes = ("portfolio/lcm/copys/");

# Process command line options...
GetOptions (
  'help|?' => \$help,
  'version' => \$version,
  'quiet' => \$quiet,
  'in:s' => \$file_in,
  'out:s' => \$file_out,
  'include:s' => \@includes
) or usage ();

# Show help if requested...
if ($help) { usage (); }

# Show version info if requested...
if ($version) { version (); }

# Separate all include paths...
@includes = split(/:/,join(':',@includes));

# If a filename was passed, get the local dir for that file
# and add it to the front of the library list...
if ($file_in) {
  @path_for_in_split = split (/\//, $file_in);
  $path_for_in_split [@path_for_in - 1] = "";
  $path_for_in_joined = join ('/', @path_for_in_split);
  @includes = ($path_for_in_joined, @includes);
}

# DEBUG: foreach $inc (@includes) { print STDERR ("including $inc\n"); }

# Prepare input stream...
$fh_in = 'STDIN';
if ($file_in) {
  open (HANDLE_IN, $file_in) or file_not_found ($file_in);
  $fh_in = 'HANDLE_IN';
}

# Prepare output stream...
$fh_out = 'STDOUT';
if ($file_out) {
  open (HANDLE_OUT, ">$file_out") or cannot_open_file ($file_out);
  $fh_out = 'HANDLE_OUT';
}


# Variables for bookkeeping...
$line_count = 0;
$found_copy_count = 0;
$copys_with_replacing_count = 0;
$pseudotext_count = 0;


# Let's do some real work...

# Read a line from input...
$not_at_end = ($line = <$fh_in>);
# Loose trailing newlines...
chomp ($line);
# Bookkeeping...
$line_count++;
# We're at the start of a new line...
$new_line = 1;

# While more input is available...
while ($not_at_end) {

  if ($new_line and $line =~ /^\s{6}\*/ ) {
  	# Comment. (Only checked for at the start of a new line.)
    # Pass on...
    print $fh_out ("$line");
    $line = "";

  } elsif ($line =~ /(\s|^)COPY(\s|$)/i) {
  	
    # The current line (possibly) holds the start of a COPY statement.

    # Keep track of these matches...    
    $line_start = "$`";
    $copy = "$&";
    $line_end = "$'";

  	# Ensure COPY is not in string literal...

    if (is_balanced ($line_start) != 0) {
      # We found COPY to be inside a literal.
      # First we output the COPY we found...

      print $fh_out ("$line_start");
      print $fh_out ("$copy");

      # Then we look for the remainder of the literal...
      $line_end =~ /^[^${last_quote}]*${last_quote}/;         # bug: \" or \'

      # We output that remaining part...
      print $fh_out ("$&");

      # And set up what's next to check...
      $line = "$'";
        
      # On to the next iteration...
      next;
    }

    # -- Get full COPY statement... ---------------------------------

    # We need to the find the end of the COPY statement.
    $looking_for_end = 1;
    $line = "$line_end";

    while ($looking_for_end) {

      while ($looking_for_end and $line ne "") {
      	# Scanning line...
        if ($line =~ /\./) {
          # Possible closing dot.
          $copy_start = "$`";
          $copy_end = "$'";
    
          # Check balance of brackets...
          if (is_balanced ($copy_start) == 0) {
            # Balanced brackets: closing dot found.
            # Gather up the results...
            $copy = "$copy$`.";
            $line_end = "$'";
            # Stop searching...
            $looking_for_end = 0;
          } else {
          	# Dot is inside literal.
          	# Close literal, and continue searching...
            $copy = "$copy$copy_start.";
            $copy_end =~ /^[^${last_quote}]*${last_quote}/;    # bug: \" or \'
            $copy = "$copy$&";
            $line = "$'";
          }

        } else {
          # No closing dot in this line.
          # Continue copy...
          $copy = "$copy$line";
          $line = "";
        }
      }

      if ($looking_for_end) {
      	# Copy was not completed in last line.
      	# Read a new line from input...
        if ($line = <$fh_in>) {
          # Loose trailing newlines...
          chomp ($line);
          # Bookkeeping...
          $line_count++;
        } else {
          # We reached end of file before we found a dot.
          # Flag error and exit...
          print STDERR ("Error: end of file before end of COPY.\n");
          exit (-1);
        }
      }

    } # while looking_for_end

    $line = "$line_end";

    # We now have the full copy statement in $copy.

    # -- Parse COPY statement ---------------------------------------

    # We look for the file name...
    $copy_name = "";

    if ($copy =~ /COPY\s+(\"[^\"]+\"|\'[^\']+\'|[^ \"\'\.]+)\s*/i) {
      # File name found.
      $copy_name = "$1";
      $copy = "$'";
      
      # Loose surrounding quotes...
      if ($copy_name =~ /^\"(.*)\"$/) {
        $copy_name = "$1";
      } elsif ($copy_name =~ /^\'(.*)\'$/) {
        $copy_name = "$1";
      }

    } else {
      # No filename in COPY.
      # Flag error and exit...
      print STDERR ("Error: no filename in COPY (line $line_count).\n");
      exit (-1);
    }

    # We look for the library name...
    $library_name = "";
    if ($copy =~ /^(IN|OF)\s+(\"[^\"]+\"|\'[^\']+\'|[^ \"\'\.]+)\s*/) {
      # File name found.
      $library_name = "$2";
      $copy = "$'";
    }

    # We look for a SUPPRESS...
    $suppress = 0;
    if ($copy =~ /^SUPPRESS\s*/) {
      # File name found.
      $suppress = 1;
      $copy = "$'";
    }

    # Make sure we start on a new line, outputting whatever
    # is still left behind...
    if ((not $new_line) or $line_start !~ /^[ ]*$/) {
      print $fh_out ("$line_start\n");
    }

    # Output copy start...
    print $fh_out ("      * \@yl:copy_start $copy_name");
    if ($library_name ne "") { print $fh_out (" in $library_name"); }
    if ($suppress) { print $fh_out (" SUPPRESS"); }
    print $fh_out ("\n");

    # We look for REPLACING...
    $replacement_count = 0;
    @operands = ();
    @replacements = ();
    if ($copy =~ /^REPLACING\s*/) {
      # Found REPLACING.
      $copy = "$'";
      
      while ($copy =~ /([^ \.]*|==[^=]*==)[ ]* BY [ ]*([^ \.]*|==[^=]*==)\s*/) {
        print $fh_out ("      * \@yl:replacing $1 BY $2\n");
        $copy = "$'";

        # Keep track of mappings...
      	$operands[$replacement_count] = "$1";
      	$replacements[$replacement_count] = "$2";

        # Clear replacement from possible pseudotext...
      	if ($replacements[$replacement_count] =~ /^==([^=]*)==$/) {
          $replacements[$replacement_count] = "$1";
        }
      	
      	$replacement_count++;
      }
      
      # Bookkeeping...
      $copys_with_replacing_count++;
    }

    # $copy should now be empty...
    if ($copy !~ /\s*\./) {
      # $copy is not empty.
      # Flag error and exit...
      print STDERR ("Error: unexpected format of COPY ($line_count).\n");
      exit (-1);
    }

    # -- Include COPY in output... ----------------------------------

    # Find file for COPY...
    $full_file_name = &get_full_filename_for_copy ($copy_name, $library_name);

    if ($full_file_name eq "") {
      # File not found.
      # Flag error and exit...
      print STDERR ("Error: could not find copy file for $copy_name");
      if ($library_name ne "") { print STDERR (" in $library_name"); }
      print STDERR (" (line $line_count).\n");
      # exit (-1);
    }

    # Open file for inclusion...
    open (COPY, $full_file_name);
    
    if ($replacement_count == 0) {
      # Nothing to replace.
      # Simply copy all lines...
      @copy_lines = <COPY>;
      print $fh_out @copy_lines;

    } else {
    	
      # We have to do replacements.
      
      # First we prepare the A-operands...
      @swatches = ();
      foreach $swatch (@operands) {

      	# Pseudo-text or not ?
      	if ($swatch =~ /^==([^=]*)==$/) {

      	  # Pseudo-text.
      	  $pseudotext_count++;

          # Remove separators...
          $swatch = "$1";

          # Prepare new keyword list...
          @keyword_list = ();
          $keyword_list [0] = "";

          while ($swatch ne "") {

            # This is the hard part. The pseudotext has to be split into
            # text-words which we can then match against the library.
            # I'll try to stick as close to the definition of text-
            # words as possible. It's complex though, and will most
            # likely (if not definitely) contain errors.
            
            # First: "Text-words are: [...] Separators, except for: space,
            # pseudo-text delimiters, and the opening and closing
            # delimiters for non-numeric literals."

            # I won't bother including the definition of separators
            # here as it is most definitely a nightmare...

            # I have chosen to treat '\n' logically as a space. This is
            # because '.' is not a separator unless followed by a space.
            # Hoewever, in the code I was testing a '.' seemed to also be
            # considered a separator if found at the end of an inputline.
            # Hence the extra $'s in the pattern definitions.
            # I really should check my documentation on end-of-lines...
            
            if ($swatch =~ /^(,( |$)|;( |$)|\.( |$)|\(|\)|:)/) {
              $keyword_list [@keyword_list] = "$1";
              $keyword_list [@keyword_list] = "";
              $swatch = "$'";
            	
            # Second: "Text-words are: [...] Literals including, in the
            # case of non-numeric literals, the opening quotation mark
            # and the closing quotation mark which bound the literal. A
            # string within a non-numeric literal is not a separate
            # text-word."
            # Got that ? Right; let's get to it! There are a lot of
            # cases to cover here...
            } elsif ($swatch =~ /^\"(\\\"|[^\"])*\"/) {
              $keyword_list [@keyword_list] = "$&";
              $keyword_list [@keyword_list] = "";
              $swatch = "$'";
  
            } elsif ($swatch =~ /^\'(\\\'|[^\'])*\'/) {
              $keyword_list [@keyword_list] = "$&";
              $keyword_list [@keyword_list] = "";
              $swatch = "$'";
  
            } elsif ($swatch =~ /^(x|X)\"[0-9a-fA-F]*\"/) {
              $keyword_list [@keyword_list] = "$&";
              $keyword_list [@keyword_list] = "";
              $swatch = "$'";
  
            } elsif ($swatch =~ /^(x|X)\'[0-9a-fA-F]*\'/) {
              $keyword_list [@keyword_list] = "$&";
              $keyword_list [@keyword_list] = "";
              $swatch = "$'";
  
            } elsif ($swatch =~ /^(\+|-)?[0-9]+(\.[0-9]+)?/) {
              $keyword_list [@keyword_list] = "$&";
              $keyword_list [@keyword_list] = "";
              $swatch = "$'";
  
            # Third: "Text-words are: [...] Any other sequence of
            # characters delimited by separators, except comment lines
            # and the word COPY, bounded by separators."
            # No comment lines possible in pseudotext. And I'm gonna
            # ignore the COPY-thing...
            } else {
              # Leading separators...
              $swatch =~ /^( |, |; |\. |:|\(|\)|==)*/;
              $last_spacing = $keyword_list [@keyword_list - 1];
              $keyword_list [@keyword_list - 1] = "$last_spacing$&";
              $swatch = "$'";

              # Text-word...
              $swatch =~ /( |$|,( |$)|;( |$)|\.( |$)|:|\(|\)|==)/;
              $keyword_list [@keyword_list] = "$`";
              $keyword_list [@keyword_list] = "";
              $swatch = "$&$'";
            }
          } # End of while line not empty.

          # Ignore bounding whitespace...
          # (Loosing leading whitespace is required for the matching
          # algorithm to work correctly.)
          shift (@keyword_list);
          pop (@keyword_list);

          # Add to swatches...
          $swatches[scalar @swatches] = \@keyword_list;

      	} else {
      	  
      	  # Single word.
          $swatches[scalar @swatches] = [$swatch];
      	}
      }

      # Now we prepare the copy-file for matching...
      # @library_text will interleave text-words and spacing,
      # starting with spacing.
      @library_text = [];
      $library_text [0] = "";
      while ($library_line = <COPY>) {

        if ($library_line =~ /^\s{6}\*/) {

          # Comment line. These are not text-words.
          # Add to last spacing...
          $last_spacing = $library_text [@library_text - 1];
          $library_text [@library_text - 1] = "$last_spacing$library_line";

        } else {

          while ($library_line ne "\n") {

            # This is the hard part. Lines have to be split into
            # text-words which we can then match against the A-operands.
            # I'll try to stick as close to the definition of text-
            # words as possible. It's complex though, and will most
            # likely (if not definitely) contain errors.
            
            # First: "Text-words are: [...] Separators, except for: space,
            # pseudo-text delimiters, and the opening and closing
            # delimiters for non-numeric literals."

            # I won't bother including the definition of separators
            # here as it is most definitely a nightmare...

            # I have chosen to treat '\n' logically as a space. This is
            # because '.' is not a separator unless followed by a space.
            # Hoewever, in the code I was testing a '.' seemed to also be
            # considered a separator if found at the end of an inputline.
            # Hence the extra $'s in the pattern definitions.
            # I really should check my documentation on end-of-lines...
            
            if ($library_line =~ /^(,( |$)|;( |$)|\.( |$)|\(|\)|:)/) {
              $library_text [@library_text] = "$1";
              $library_text [@library_text] = "";
              $library_line = "$'";
            	
            # Second: "Text-words are: [...] Literals including, in the
            # case of non-numeric literals, the opening quotation mark
            # and the closing quotation mark which bound the literal. A
            # string within a non-numeric literal is not a separate
            # text-word."
            } elsif ($library_line =~ /^\"(\\\"|[^\"])*\"/) {
              $library_text [@library_text] = "$&";
              $library_text [@library_text] = "";
              $library_line = "$'";
  
            } elsif ($library_line =~ /^\'(\\\'|[^\'])*\'/) {
              $library_text [@library_text] = "$&";
              $library_text [@library_text] = "";
              $library_line = "$'";
  
            } elsif ($library_line =~ /^(x|X)\"[0-9a-fA-F]*\"/) {
              $library_text [@library_text] = "$&";
              $library_text [@library_text] = "";
              $library_line = "$'";
  
            } elsif ($library_line =~ /^(x|X)\'[0-9a-fA-F]*\'/) {
              $library_text [@library_text] = "$&";
              $library_text [@library_text] = "";
              $library_line = "$'";
  
            } elsif ($library_line =~ /^(\+|-)?[0-9]+(\.[0-9]+)?/) {
              $library_text [@library_text] = "$&";
              $library_text [@library_text] = "";
              $library_line = "$'";
  
            # Third: "Text-words are: [...] Any other sequence of
            # characters delimited by separators, except comment lines
            # and the word COPY, bounded by separators."
            # Comment lines have been already taken care of. And I'm gonna
            # ignore the COPY-thing...
            } else {
              # Leading separators...
              $library_line =~ /^( |, |; |\. |:|\(|\)|==)*/;
              $last_spacing = $library_text [@library_text - 1];
              $library_text [@library_text - 1] = "$last_spacing$&";
              $library_line = "$'";

              # Text-word...
              $library_line =~ /( |$|,( |$)|;( |$)|\.( |$)|:|\(|\)|==)/;
              $library_text [@library_text] = "$`";
              $library_text [@library_text] = "";
              $library_line = "$&$'";
            }
          } # End of while line not empty.

          # Add what remains of line (eg "\n")...
          $last_spacing = $library_text [@library_text - 1];
          $library_text [@library_text - 1] = "$last_spacing$library_line";
          
        } # End of regular line.
      } # End while input.

      # Finally! We now have "tokenized" our inputs, and are ready to
      # do some real matching...

      # "Everything which precedes the first text-word in the library
      # text is copied into the compilation unit."
      print $fh_out (shift (@library_text));

      # "Staring with the first of the A-operands [...], each A-operand
      # is in turn compared with the corresponding number of library
      # text words, starting each time with the first library word.
      
      while (@library_text != 0) {
      	
      	# Try the A-operands until we have a match.
      	$match_found = 0;
      	$current_swatch = 0;
      	while ((not $match_found) and ($current_swatch < @swatches)) {
      	  $swatch = $swatches[$current_swatch];

          # Try matching the full swatch...
          $current_text_word = 0;
          $current_swatch_matches_sofar = 1;
          while ($current_text_word < (@$swatch / 2)
                 and $current_swatch_matches_sofar) {
          	$text_word = $$swatch[$current_text_word * 2];

            # Check word for word, ignoring case...
          	if ($library_text[$current_text_word * 2] !~ /^${text_word}$/i) {
          	  # Swatch fails.
              $current_swatch_matches_sofar = 0;
            } else {
              # Swatch still in the game.
              # Try next word...
              $current_text_word++;
            }
          } # End of trying current swatch.
          
          if ($current_swatch_matches_sofar) {
          	$match_found = 1;
          } else {
            $current_swatch++;
          }
      	} # End of trying swatches.

        if ($match_found) {
          # We found a match.
          # Output its replacement...
          print $fh_out ($replacements [$current_swatch]);
          
          # And loose the matching input...
          $current_text_word = 0;
          while ($current_text_word < @$swatch - 1) {
            shift (@library_text);
            shift (@library_text);
            $current_text_word++;
          }
          shift (@library_text);
          # Last spacing needs to be output as well...
          print $fh_out (shift (@library_text));

        } else {
          # Default: no match.
          # Output text-word and spacing.
          print $fh_out (shift (@library_text));
          print $fh_out (shift (@library_text));
        }
      } # End of scanning library text.

      # YES! FINALLY! It's crude, and will definitely have some
      # hidden bugs, but we now have support for the REPLACING
      # clause in COPY statements.
      
      # Unrelated note: Perl 5 syntax for multidimensional arrays
      # is a big pain in the ass. This approach of prefixing '$' or
      # '@' is not intuitive. And not having to first declare names
      # for later use has given me some hard to find bugs (because
      # typos are ignored). But its pattern matching capabilities
      # rule!

      # Some cleanup...
      @library_text = ();
      @swatches = ();

      # End of replacements.
    }

    # Close included file...
    close (COPY);

    # -- Finish up... -----------------------------------------------

    # Output copy end...
    print $fh_out ("      * \@yl:copy_end");
    if ($line_end =~ /\S/) {
      print $fh_out ("\n");
    }
    
    # Set up next input...
    $line = "$line_end";

    # Bookkeeping...
    $found_copy_count++;

  } else {

    # No comment. No COPY in line.
    # Pass on...
    print $fh_out ("$line");
    $line = "";

  } # End of different cases for line.

} continue {
  if ($line =~ /^\s*$/) {
  	print $fh_out ("$line\n");
  	$not_at_end = ($line = <$fh_in>);
  	chomp ($line);
  	$new_line = 1;
    $line_count++;

  } else {
  	$new_line = 0;
  }
}

# Optionally print a report...
if (!$quiet) {
  print STDERR ("Processed $line_count line(s).\n");
  print STDERR ("  $found_copy_count COPY statement(s) total.\n");
  print STDERR ("  $copys_with_replacing_count COPY(s) used REPLACING.\n");
  print STDERR ("  $pseudotext_count case(s) of pseudotext.\n");
}

exit ($found_copy_count);


# ===================================================================

sub usage {
  print STDERR ("Usage: preprocess_cobol [OPTIONS]...\n");
  print STDERR ("Where OPTIONS is one of:\n");
  print STDERR ("  -in FILE       read from FILE rather than STDIN\n");
  print STDERR ("  -out FILE      write to FILE rather than STDOUT\n");
  print STDERR ("  -include PATH  look for COPYs in PATH\n");
  print STDERR ("  -quiet         no report at end of run\n");
  print STDERR ("  -version       show version information\n");
  print STDERR ("  -help          print this message\n");
  print STDERR ("\n");
  exit (0);
}

# ===================================================================

sub version {
  print STDERR ("This is \"preprocess_cobol\", version 0.0.10\n");
  print STDERR ("Part of Yerna Lindale (old music).\n");
  exit (0);
}

# ===================================================================

sub file_not_found {
  print STDERR ("File not found: $_[0]\n");
  exit (-101);
}

# ===================================================================

sub cannot_open_file {
  print STDERR ("Cannot open file: $_[0]\n");
  exit (-102);
}

# ===================================================================

sub is_balanced {
    # We need to verify that all quotes are balanced...
    $possible_line = "$_[0]";

    # DEBUG: print STDERR ("1 $possible_line\n");

    # Loose everything but true quotes...
    $possible_line =~ s/\\.//g;
    $possible_line =~ s/[^\"\']//g;

    # DEBUG: print STDERR ("2 $possible_line\n");

    # Balance quotes...    
    while (length ($possible_line) > 1) {
      $possible_line =~ s/^\"\"//;
      $possible_line =~ s/^\'\'//;

      $possible_line =~ s/^\"\'/\"/;
      $possible_line =~ s/^\'\"/\'/;

      # DEBUG: print STDERR ("3 $possible_line\n");
    }

  if (length ($possible_line) == 1) {
    $last_quote = "$possible_line";
  }

  return length ($possible_line);
}

# ===================================================================

sub get_full_filename_for_copy {
  local ($name, $library);
  $name = $_[0];
  $library = $_[1];

  if ($name eq "") { return ""; }

  local ($true_name);
  $true_name = "";

  foreach $include_base (@includes) {
  	$true_name = &find_file ($include_base, $library, $name);
  	if ($true_name !~ /^$/) {
  	  return "$true_name";
  	}
  }

  return "";
}

# ===================================================================

sub find_file {
  local ($root, $name);
  $root = "$_[0]";
  if ($_[1] eq "") {
    $root = "$_[0]";
  } else {
    $root = "$root$_[1]/";
  }
  $name = "$_[2]";

  if ($name eq "") { return ""; }

  local ($uppercase_name);
  $uppercase_name = "$name";
  $uppercase_name =~ tr/a-z/A-Z/;

  local (@listing);
  opendir (ROOT, $root) || return "";
  @listing = readdir (ROOT);

  local ($file);
  foreach $file (@listing) {
  	local ($temp);
  	$temp = "$file";
    $temp =~ tr/a-z/A-Z/;
    
    if ($temp =~ /^$uppercase_name(\.(CPY|DEF))?$/) {
      return "$root$file";
    }
  }

  return "";
}
