#!/usr/bin/perl

# ***** BEGIN LICENSE BLOCK *****
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Yerna Lindale.
#
# The Initial Developer of the Original Code is
# the Software Engineering Lab, INTEC, University Ghent.
# Portions created by the Initial Developer are Copyright (C) 2004
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Kris De Schutter <kris.deschutter@ugent.be>
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ***** END LICENSE BLOCK *****

# This script "normalizes" a terminal-formatted Cobol source program.
#
# "Normalisation" applies to comments and continuation lines.
# All forms of comments are reduced to just one standard type.
# All continuation lines are removed while preserving the literals.
# Null characters and backslashes ('\') are escaped (so flex won't
# have any problems parsing).
#
# This tool is part of "Yerna Lindale" (old music).
#
# @author Kris De Schutter <kris.deschutter@ugent.be>
# @date Fri. March 12, 2004

use Getopt::Long;

# Options...
$help = 0;
$version = 0;
$quiet = 0;
$skip_idents = 0;
$file_in = '';
$file_out = '';

# Process command line options...
GetOptions (
  'help|?' => \$help,
  'version' => \$version,
  'quiet' => \$quiet,
  'skip_idents' => \$skip_idents,
  'in:s' => \$file_in,
  'out:s' => \$file_out
) or usage ();

# Show help if requested...
if ($help) { usage (); }

# Show version info if requested...
if ($version) { version (); }

# Prepare input stream...
$fh_in = 'STDIN';
if ($file_in) {
  open (HANDLE_IN, $file_in) or file_not_found ($file_in);
  $fh_in = 'HANDLE_IN';
}

# Prepare output stream...
$fh_out = 'STDOUT';
if ($file_out) {
  open (HANDLE_OUT, ">$file_out") or cannot_open_file ($file_out);
  $fh_out = 'HANDLE_OUT';
}

# Bookkeeping...
$line_count = 0;
$sequence_numbers = 0;
$identifications = 0;
$debug_lines = 0;
$new_pages = 0;
$comments = 0;
$continuation_lines = 0;
$source_code = 0;
$comment_entries = 0;
$in_comment_entry = 0;

# Necessary for continuation lines...
@output_lines = ();

# For prosperity...
print $fh_out ("      * \@yl:format TERMINAL COBOL\n");

# Let's do some real work...
while ($line = <$fh_in>) {
  # Loose trailing newlines...
  chomp ($line);

  # -- phase 0 ------------------------------------------------------

  # Check for comment entries...
  if ($in_comment_entry) {
    if ($line =~ /^\s{4}(.*)$/) {
      # Still processing comment entry.
      # Normalize...
      $output_lines [@output_lines] = "      * \@yl:comment-entry $1";

      # Bookkeeping...
      $comment_entries++;

      # Other phases have just become useless...
      next;

    } else {
      # No more comment entries.
      $in_comment_entry = 0;
    }
  }

  if ($line =~ /^(author|installation|date-written|date-compiled|security|remarks)\.(.*)$/i) {
    # Welcome to the wonderful world of comment entries.
    # Normalize...
    $output_lines [@output_lines] = "      * \@yl:comment-entry-start $1.$2";

    # Flag for next pass...
    $in_comment_entry = 1;

    # Bookkeeping...
    $comment_entries++;

    # Other phases have just become useless...
    next;
  }

  # -- phase 1 ------------------------------------------------------

  # Check for comment lines...
  if ($line =~ /^(\*|\$)(.*)$/) {
    # Comment.
    # Normalize...
    $output_lines [@output_lines] = "      *$2";

    # Bookkeeping...
    $comments++;
    
    # Other phases have just become useless...
    next;

  } elsif ($line =~ /^\/(.*)$/) {
    # New page request.
    # Normalize...
    $output_lines [@output_lines] = "      * \@yl:newpage $1";

    # Bookkeeping...
    $new_pages++;
    
    # Other phases have just become useless...
    next;

  } elsif ($line =~ /^\\D(.*)$/) {
    # New page request.
    # Normalize...
    $output_lines [@output_lines] = "      * \@yl:debug $1";

    # Bookkeeping...
    $debug_lines++;
    
    # Other phases have just become useless...
    next;
  }

  # -- phase 2 ------------------------------------------------------

  # We check for "identifications"...
  if ($skip_idents) {
    
    # Do not look for identifications.
    # Skip this phase.
    
  } elsif ($line =~ /^(.*)\|([^\|]*)$/) {

    # We have found a possible identification.
    # Save partial matches for later...
    $remainder = "$1";
    $ident = "$2";

    if (is_balanced ($remainder) == 0) {
      # All quotes have been balanced.
      # Line has an identification.

      # Normalize...
      $output_lines [@output_lines] = "      * \@yl:ident $ident";
      $line = $remainder;
  
      # Bookkeeping...
      $identifications++;
    }
  }

  # -- phase 3 ------------------------------------------------------

  if ($line =~ /^-/) {
    
    # Continuation line.
    # Check whether we're continuing anything...
    if (@output_lines == 0) {
      print STDERR ("Error: no line to continue ($line_count).\n");
      exit (1);
    }

    # "A hyphen in a line's indicator area causes the first nonblank
    # character in Area B to be the immediate successor of the last
    # nonblank character of the preceding line. This continuation
    # excludes intervening comment lines and blank lines."
    
    # "Note that if the continued line ends with a nonnumeric literal
    # without a closing quotation mark, the first nonblank character
    # in Area B of the continuation line must be a quotation mark. The
    # continuation starts immediately after the quotation mark. All
    # spaces at the end of the continued line are part of the literal."
    
    # "If the indicator area of the continuation line is blank, then
    # the compiler treats the last nonblank character of the preceding
    # line as if it were followed by a space."

    # First we need to get the correct line to continue...
    $i = @output_lines - 1;
    $continued_line = "";

    # Skip all blank lines and comment line
    while ($i >= 0) {
      $continued_line = $output_lines [$i];
      if ($continued_line !~ /^\s*$/ and $continued_line !~ /^.{6}\*/ ) {
        last;
      }
      $i--;
    }

    # Make sure we've found a suitable line...
    if ($i < 0) {
      print STDERR ("Error: no line to continue ($line_count).\n");
      exit (1);
    }

    # Bubble continued line to top of the stack...
    while ($i < (@output_lines - 1)) {
      $output_lines [$i] = $output_lines [$i + 1];
      $i++;
    }
    $output_lines [@output_lines - 1] = $continued_line;

    if (is_balanced ($continued_line) != 0) {
      # Previous line had unclosed nonnumeric literal.
      # Get to start of continued literal...
      if (($line =~s/^-[^\"\']*(\"|\')//) == 0) {
        # No quotation mark...
        print STDERR ("Error: bad continuation line ($line_count).\n");
        exit (1);
      }
      
      # Weld it all together...      
      $continued_line = "$continued_line$line";

    } else {
      # No unfinished nonnumeric literal.

      # Blank indicator area ?
      $space = "";
      if ($line =~ /^-\s{4}/) { $space = " ";}

      # Get to start of continued line...
      if (($line =~ s/^-.{4}\s*(\S)/\1/) == 0) {
        # All blanks...
        print STDERR ("Error: bad continuation line ($line_count).\n");
        exit (1);
      }
      
      # Loose final blanks on continued line...
      $continued_line =~ s/(\S)\s*$/\1/;

      # Weld it all together...      
      $continued_line = "$continued_line$space$line";
    }

    # Update last line on stack...
    $output_lines [@output_lines - 1] = $continued_line;
  
    # Bookkeeping...
    $continuation_lines++;

  } else {
    
    # Basic code; just pass it on...
    flush_previous_lines ();
    $output_lines [@output_lines] = "$line";

    # Bookkeeping...
    $source_code++;
  }

  # -- finally ------------------------------------------------------

  # Bookkeeping...
  $line_count++;
}

flush_previous_lines ();

close ($fh_in);
close ($fh_out);

# Optionally print a report...
if (!$quiet) {
  print STDERR ("Processed $line_count line(s).\n");
  print STDERR ("  $sequence_numbers sequence number(s).\n");

  if ($skip_idents) {
    print STDERR ("  $identifications identification(s). (ignored)\n");
  } else {
    print STDERR ("  $identifications identification(s).\n");
  }

  print STDERR ("  $debug_lines debug line(s).\n");
  print STDERR ("  $new_pages line(s) with a new page directive.\n");
  print STDERR ("  $comments line(s) with standard comments.\n");
  print STDERR ("  $continuation_lines continuation line(s).\n");
  print STDERR ("  $comment_entries comment entry(s).\n");
  print STDERR ("  $source_code line(s) of basic code.\n");
}

# ===================================================================

sub usage {
  print STDERR ("Usage: normalize_terminal_cobol [OPTIONS]...\n");
  print STDERR ("Where OPTIONS is one of:\n");
  print STDERR ("  -in FILE      read from FILE rather than STDIN\n");
  print STDERR ("  -out FILE     write to FILE rather than STDOUT\n");
  print STDERR ("  -quiet        no report at end of run\n");
  print STDERR ("  -skip_idents  do not look for identifications\n");
  print STDERR ("  -version      show version information\n");
  print STDERR ("  -help         print this message\n");
  print STDERR ("\n");
  exit (0);
}

# ===================================================================

sub version {
  print STDERR ("This is \"normalize_terminal_cobol\", version 0.0.1\n");
  print STDERR ("Part of Yerna Lindale (old music).\n");
  exit (0);
}

# ===================================================================

sub file_not_found {
  print STDERR ("File not found: $_[0]\n");
  exit (-101);
}

# ===================================================================

sub cannot_open_file {
  print STDERR ("Cannot open file: $_[0]\n");
  exit (-102);
}

# ===================================================================

sub flush_previous_lines {
  foreach $output_line (@output_lines) {
    $output_line =~ s/\\/\\\\/g;
    $output_line =~ s/\0/\\0/g;
    print $fh_out ("$output_line\n");
  }
  @output_lines = ();
}

# ===================================================================

sub is_balanced {
    # We need to verify that all quotes are balanced...
    $possible_line = "$_[0]";

    # DEBUG: print STDERR ("1 $possible_line\n");

    # Loose everything but true quotes...
    $possible_line =~ s/\\.//g;
    $possible_line =~ s/[^\"\']//g;

    # DEBUG: print STDERR ("2 $possible_line\n");

    # Balance quotes...    
    while (length ($possible_line) > 1) {
      $possible_line =~ s/^\"\"//;
      $possible_line =~ s/^\'\'//;

      $possible_line =~ s/^\"\'/\"/;
      $possible_line =~ s/^\'\"/\'/;

      # DEBUG: print STDERR ("3 $possible_line\n");
    }

  return length ($possible_line);
}
