#!/usr/bin/perl

# ***** BEGIN LICENSE BLOCK *****
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Yerna Lindale.
#
# The Initial Developer of the Original Code is
# the Software Engineering Lab, INTEC, University Ghent.
# Portions created by the Initial Developer are Copyright (C) 2004
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Kris De Schutter <kris.deschutter@ugent.be>
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ***** END LICENSE BLOCK *****

# This script takes a (Prolog) body of text, and looks for DOM
# annotations. It parses these annotations and translates them to
# equivalent Prolog statements. This makes it easier to describe
# some XML structures one wants to build from Prolog.
#
# The user must implement a predicate which defines how literals
# are built.
#
# [ DOM annotations ]
#
# A DOM annotation starts with a "<<" (minus the quotes) and ends
# at the first ">>" (minus the quotes). These markers will not be
# matched if they are part of a string literal.
#
# The format of a DOM annotation is described using the following
# grammar:
#
#   annotation : document "|" node_name node structure? ;
#
#   structure : ":" element+ ;
#
#   element :
#     literal | comment_node | node | node_name ("(" element+ ")")? ;
#
#   document : reference ;
#
#   node_name : identifier ;
#
#   node : reference ;
#
# [ Identifiers vs references ]
#
# A reference starts with an "@" (minus the quotes) and is followed
# by an identifier. For example "@MyNode".
#
# A reference refers to a DOM node. If it is used in the annotation
# part it means the associated identifier will be bound to the newly
# created node. If it is used within the structural part it means
# the node associated with the identifier will be used at that point.
#
# A reference to the document is compulsory, because we need a
# document as a factory to create new nodes.
#
# [ Comment nodes ]
#
# A comment node is exactly what it says it is. It represents a
# comment node in the DOM structure being built.
#
# Comment nodes start with a "[" and end at the first "]" (minus the
# brackets in both cases). Double quoted string literals are taken
# into account though. Anything between these brackets is seen as the
# contents of the comment to be generated.
#
#
# This tool is part of "Yerna Lindale" (old music).
#
# @author Kris De Schutter <kris.deschutter@ugent.be>
# @date Mon. Jun 7, 2004

use Getopt::Long;

# Options...
$help = 0;
$version = 0;
$quiet = 0;
$file_in = '';
$file_out = '';

# Process command line options...
GetOptions (
  'help|?' => \$help,
  'version' => \$version,
  'quiet' => \$quiet,
  'in:s' => \$file_in,
  'out:s' => \$file_out,
) or usage_and_exit ();

# Show help if requested...
if ($help) { usage_and_exit (); }

# Show version info if requested...
if ($version) { version_and_exit (); }

# Prepare input stream...
$fh_in = 'STDIN';
if ($file_in) {
  open (HANDLE_IN, $file_in) or file_not_found ($file_in);
  
  $fh_in = 'HANDLE_IN';
}

# Prepare output stream...
$fh_out = 'STDOUT';
if ($file_out) {
  open (HANDLE_OUT, ">$file_out") or cannot_open_file ($file_out);
  $fh_out = 'HANDLE_OUT';
}

# Bookkeeping...
$line_count = 0;
$annotation_count = 0;

# -- scanner --------------------------------------------------------

# This keeps track of tokens found...
$tokens = [];

# Scanner states...
$TEXT = 1;
$DOM = 2;

# Indicates current scanner state...
$scanner_state = $TEXT;

# Current line...
$line = '';

# Used in code generation...
$node_count = 0;

# For indentation of generated code...
$depth = 0;
$starting_depth = 0;

# While end of file has not been reached...
while ($end_of_file == 0) {

  # Read in new line if neccesary...
  if ($line =~ /^\s*$/) {
    # First output remaining whitespace though...
    print $fh_out ($line) unless ($scanner_state == $DOM or $line_count == 0);

    if ($line = <$fh_in>) {
      # Output newline (unless this is the first time)...
      print $fh_out ("\n") unless ($scanner_state == $DOM or $line_count == 0);

      # Reset depth...
      $depth = 0;
        
      # Loose trailing newlines...
      chomp ($line);
      
      # Bookkeeping...
      $line_count++;
    } else {
      # Flag end of file (not really necessary)...
      $end_of_file = 1;
      # Exit loop (this is why the previous is unnecessary)...
      last;
    }
  }

  # Check scanner state...
  if ($scanner_state == $TEXT) {
    # +-------------------------------------------------------------+
    # | TEXT TEXT TEXT TEXT TEXT TEXT TEXT TEXT TEXT TEXT TEXT TEXT |
    # +-------------------------------------------------------------+
    # Processing text (prolog code).
    
    # Match one of ", ' or <<...
    if ($line =~ /^([^<\"\']*)(\"|\'|<)(.*)$/) {
    
      if ($2 eq '<') {
        # Possible start of a DOM annotation.
        # First get rid of the part before this...
        print $fh_out ($1);
        $depth += length ($1);
        $line = "$3";

        if ($line =~ /^<(.*)$/) {
          # We have found the start of a DOM annotation.
          $line = "$1";
          # We keep hold of current depth...
          $starting_depth = $depth;
          # We set the scanner to the new state...
          $scanner_state = $DOM;
        } else {
          # We found < but not <<.
          print $fh_out ('<');
          $depth++;
        }

      } else {
        # We found the start of a literal.
        print $fh_out ("$1");
        $depth += length ($1);
        # Gather up match so far...
        local $text = "$2";
        local $delimiter = "$2";
        $line = "$3";

        # Scan for end of literal...
        local $scanning_literal = 1;
        while ($scanning_literal == 1) {

#          if ($line =~ /^(\\$delimiter)(.*)/) {
#            $text = "$text$delimiter$delimiter";
#            $line = "$2";

          if ($line =~ /^(\\.)(.*)/) {
            $text = "$text$1";
            $line = "$2";

          } elsif ($line =~ /^$delimiter(.*)/) {
            $text = "$text$delimiter";
            print $fh_out ($text);
            $depth += length ($text);
            $line = "$1";
            $scanning_literal = 0;

          } elsif ($line =~ /^$/) {
            scanner_error ("incomplete literal at line $line_count\n");

          } else {
            $line =~ /^([^\\$delimiter]+)(.*)$/;
            $text = "$text$1";
            $line = "$2";
          }
        }
      } # End of processing literal or "<<".

    } else {
      # No start of a DOM annotation found.
      # Copy verbatim to output...
      print $fh_out ($line);
      $depth += length ($line);
      $line = '';
    }

  } elsif ($scanner_state == $DOM) {
    # +-------------------------------------------------------------+
    # | DOM DOM DOM DOM DOM DOM DOM DOM DOM DOM DOM DOM DOM DOM DOM |
    # +-------------------------------------------------------------+
    # Processing a DOM annotation.

    # Skip whitespace...
    $line =~ /^\s*(.*)$/;
    $line = "$1";

    if ($line =~ /^>>(.*)$/) {

      # We have found the end of a DOM annotation.
      $annotation_count++;
      $line = "$1";

      # Parse tokens... --------------------------------------------!
      local $dap = parse_dap ($tokens);

      # Generate code... -------------------------------------------!
      generate_code ($starting_depth, 0, $dap);

      # Reset tokens...
      $tokens = [];
      
      # We switch scanner states...
      $scanner_state = $TEXT;

    } elsif ($line =~ /^(\"|\')(.*)/) {

      # A string literal...
      local $literal = "";
      local $delimiter = "$1";
      $line = "$2";

      local $scanning_for_literal = 1;
      while ($scanning_for_literal == 1) {

        if ($line =~ /^(\\.)(.*)/) {
          $literal = "$literal$1";
          $line = "$2";

        } elsif ($line =~ /^$delimiter(.*)/) {
          $line = "$1";
          $scanning_for_literal = 0;

        } elsif ($line =~ /^$/) {
          scanner_error ("incomplete literal at line $line_count\n");

        } else {
          $line =~ /^([^\\$delimiter]+)(.*)$/;
          $literal = "$literal$1";
          $line = "$2";
        }
      }

      # DEBUG:
      # print $fh_out ("% LITERAL: '$literal'\n");

      $tokens->[@$tokens] = "$delimiter$literal$delimiter";

    } elsif ($line =~ /^\[(.*)/) {

      # A comment node...
      local $comment = '';
      local $scanning = 1;
      $line = "$1";

      # Look for the closing curly bracket...
      while (($scanning == 1) and ($line !~ /^\s*$/)) {
  
        # Match one of " or ]...
        if ($line =~ /^([^\"\]]*)(\"|\])(.*)$/) {
  
          if ($2 eq ']') {
            # We found the closing square bracket.
            # Flag end of comment and gather up results...
            $scanning = 0;
            $comment = "$comment$1\n";
            $line = "$3";
  
            # DEBUG:
            # print STDERR ("CODE: $comment\n");
  
            # For good measure...
            chomp ($comment);
  
            # Build token...
            $tokens->[@$tokens] = "[ $comment ]";
  
          } else {
            # We found the start of a literal.
            # Gather up match so far...
            $comment = "$comment$1$2";
            $line = "$3";
  
            # Scan for end of literal...
            local $scanning_literal = 1;
            while ($scanning_literal == 1) {
  
              if ($line =~ /^(\\.)(.*)/) {
                $comment = "$comment$1";
                $line = "$2";
  
              } elsif ($line =~ /^\"(.*)/) {
                $comment = "$comment\"";
                $line = "$1";
                $scanning_literal = 0;
  
              } elsif ($line =~ /^$/) {
                scanner_error ("incomplete literal within comment at line $line_count\n");
  
              } else {
                $line =~ /^([^\\\"]+)(.*)$/;
                $comment = "$comment$1";
                $line = "$2";
              }
            }
          } # End of matching ", ' or {.
    
        } else {
          # No closing curly brace.
          scanner_error ("incomplete comment at line $line_count\n");
        }
      }

    } elsif ($line =~ /^([0-9]+)([^0-9].*)?$/) {

      # An integer literal...
      local $literal = "$1";
      $line = "$2";

      # DEBUG:
      # print $fh_out ("% LITERAL: $literal\n");

      $tokens->[@$tokens] = $literal;

    } elsif ($line =~ /^(\@?[a-zA-Z][a-zA-Z0-9_]*)([^a-zA-Z0-9_].*)?$/) {

      # An identifier or reference...
      local $identifier = "$1";
      $line = "$2";

      # DEBUG:
      # print $fh_out ("% IDENTIFIER: $identifier\n");

      $tokens->[@$tokens] = $identifier;

    } elsif ($line =~ /^(\(|\)|:|\|)(.*)$/) {

      # A structural token...
      local $token = "$1";
      $line = "$2";

      # DEBUG:
      # print $fh_out ("% TOKEN: $token\n");

      $tokens->[@$tokens] = $token;

    }

  } else {
    # This shouldn't happen unless someone has been fooling with the
    # scanner code...
    scanner_error ("scanner in unknown state ($scanner_state)");
  }

} # End of processing input.

close ($fh_in);
close ($fh_out);

# -- endgame --------------------------------------------------------

# Optionally print a report...
if (!$quiet) {
  local $rule_count = @$rules;
  print STDERR ("DAP file '$file_in':\n");
  print STDERR ("  Processed $line_count line(s).\n");
  print STDERR ("  Found $annotation_count DOM annotation(s).\n");
}

exit(0);

# ===================================================================
# Boilerplating for script...                                       =
# ===================================================================

sub usage_and_exit {
  print STDERR ("Usage: daptopl [OPTIONS]...\n");
  print STDERR ("Where OPTIONS is one of:\n");
  print STDERR ("  -in FILE      read from FILE rather than STDIN\n");
  print STDERR ("  -out FILE     write to FILE rather than STDOUT\n");
  print STDERR ("  -quiet        no report at end of run\n");
  print STDERR ("  -version      show version information\n");
  print STDERR ("  -help         print this message\n");
  print STDERR ("\n");
  exit (0);
}

# ===================================================================

sub version_and_exit {
  print STDERR ("This is \"daptopl\", version 0.0.0.\n");
  print STDERR ("Part of Yerna Lindale (old music).\n");
  exit (0);
}

# ===================================================================

sub file_not_found {
  print STDERR ("File not found: $_[0]\n");
  exit (-101);
}

# ===================================================================

sub cannot_open_file {
  print STDERR ("Cannot open file: $_[0]\n");
  exit (-102);
}

# ===================================================================

sub scanner_error {
  print STDERR ("Scanner error: $_[0]\n");
  exit (-120);
}

# ===================================================================

sub parse_error {
  print STDERR ("Parse error: $_[0]\n");
  exit (-140);
}

# ===================================================================
# Methods for parsing...                                            =
# ===================================================================

# These are ment to match tokens, not arbitrary strings !

sub is_identifier {
  return $_[0] =~ /^([a-zA-Z][a-zA-Z0-9_]*)$/;
}

sub is_reference {
  return $_[0] =~ /^(\@[a-zA-Z][a-zA-Z0-9_]*)$/;
}

sub is_literal {
  return $_[0] =~ /^(\".*\"|\'.*\'|[0-9]+)$/;
}

# ===================================================================

sub parse_dap {

  local $tokens = $_[0];
  local $position = 0;
  local $structure = [];

  # REFERENCE...
  if (not is_reference ($tokens->[$position])) {
    parse_error ('DAP does not have a reference to a document');
  }

  $structure->[@$structure] = $tokens->[$position];
  $position++;

  # VERTICAL LINE...
  if (not ($tokens->[$position] =~ /^\|$/)) {
    parse_error ('expected a vertical line');
  }
  $position++;

  # IDENTIFIER...
  if (not is_identifier ($tokens->[$position])) {
    parse_error ('DAP does not start with a node name');
  }

  $structure->[@$structure] = $tokens->[$position];
  $position++;

  # REFERENCE...
  if (not is_reference ($tokens->[$position])) {
    parse_error ('DAP does not have a reference');
  }

  $structure->[@$structure] = $tokens->[$position];
  $position++;

  if ($position < @$tokens) {
    # COLON...
    if (not ($tokens->[$position] =~ /^:$/)) {
      parse_error ('expected a colon');
    }
    $position++;

    # A stack...
    local $stack = [];
    # Bottom element is the structure we're building...
    $stack->[0] = $structure;
    # Index of last item (must be zero or higher)...
    local $top = 0;

    # While there are tokens in the input...
    while ($position < @$tokens) {

      if ($tokens->[$position] =~ /^\($/) {
        # Token is an opening brace.

        # Make sure we have a willing parent...
        if ($tokens->[$position - 1] eq ')' or $tokens->[$position - 1] eq ':') {
          parse_error ('found subnodes without a parent node');
        }

        # Retrieve parents, uncles and aunties...
        local $family = $stack->[$top];

        # Flag that parent has children...
        $family->[@$family] = '+';
        # Start a new family...
        $family->[@$family] = $stack->[++$top] = [];

      } elsif ($tokens->[$position] =~ /^\)$/) {
        # Token is a closing brace.
        if ($top == 0) {
          # Stack is empty (don't forget that the bottom-most element
          # can't be removed!)...
          parse_error ('unexpected closing bracket');

        } else {
          # Pop the last family of nodes...
          $top--;
        }

      } elsif ($tokens->[$position] =~ /^\[.*\]$/) {
        # Token is a comment node.
        local $family = $stack->[$top];
        # Add the newborn...
        $family->[@$family] = $tokens->[$position];

      } elsif (is_identifier ($tokens->[$position])
           or  is_reference ($tokens->[$position])
           or  is_literal ($tokens->[$position])) {
        # Token is a literal.
        local $family = $stack->[$top];
        # Add the newborn...
        $family->[@$family] = $tokens->[$position];
        
      } else {
        # This shouldn't happen unless someone has been playing with
        # the tokens.
        local $riddle = $tokens->[$position];
        parse_error ("unknown token $riddle");
      }

      # Next token...
      $position++;
    }
  }

  return $structure;
}

# ===================================================================
# Methods for code generation...                                    =
# ===================================================================

sub indented {
  local $depth = $_[0];
  local $text = "$_[1]";

  # Every line in $text will be printed with an indentation of
  # $depth number of spaces.

  while ($text =~ /^([^\n]*)\n(.*)$/) {
    for (local $d = $depth; $d > 0; $d--) {
      print $fh_out (' ');
    }
    print $fh_out ("$1\n");
    $text = "$2";
  }

  if ($text ne '') {
    for (local $d = $depth; $d > 0; $d--) {
      print $fh_out (' ');
    }
    print $fh_out ($text);
  }
}

# ===================================================================

sub as_identifier {
  local $reference = $_[0];
  $reference =~ /^\@?(.*)$/;
  return "$1";
}

# ===================================================================

sub generate_code {
  local $depth = $_[0];
  local $level = $_[1];
  local $dap = $_[2];
  local $parent = $_[3];
  local $document = $_[4];

  local $start = 0;

  if ($level == 0) {
    $document = as_identifier ($dap->[0]);
    local $root_node = $dap->[1];
    local $root_name = as_identifier ($dap->[2]);
    print $fh_out ("xml_make_element($document, '$root_node', $root_name)");
    $start = 3;
    $parent = $root_name;
  }

  for (local $i = $start; $i < @$dap; $i++) {
    local $node = $dap->[$i];
    
    if ($level == 0 || $i > $start) {
      print $fh_out (",\n");
    }

    if ($node eq '+') {
      local $children = $dap->[++$i];
      generate_code ($depth, $level + 1, $children, "DAP_N$node_count", $document);

    } elsif ($node =~ /\[ (.*) \]/) {
      local $comment = "$1";
      $comment =~ s/\'/\\\'/g;

      $node_count++;
      indented ($depth + 2 * $level + 2, "xml_make_comment($document, '$comment', DAP_N$node_count),\n");
      indented ($depth + 2 * $level + 2, "xml_append_child(DAP_N$node_count, $parent)");

    } elsif (is_literal ($node)) {
      if ($node =~ /^\"(.*)\"$/) {
        $node = "'$1'";
	# Following is needed to comply to the ISO-Prolog standard for quotes in string literals.
        $node =~ s/\\\'/\'\'/g;
        $node =~ s/\\\"/\"/g;
      }
      $node_count++;
      indented ($depth + 2 * $level + 2, "create_literal($document, $node, DAP_N$node_count),\n");
      indented ($depth + 2 * $level + 2, "xml_append_child(DAP_N$node_count, $parent)");

    } elsif (is_identifier ($node)) {
      $node_count++;
      indented ($depth + 2 * $level + 2, "xml_make_element($document, '$node', DAP_N$node_count),\n");
      indented ($depth + 2 * $level + 2, "xml_append_child(DAP_N$node_count, $parent)");

    } elsif (is_reference ($node)) {
      local $name = as_identifier ($node);
      indented ($depth + 2 * $level + 2, "xml_append_child($name, $parent)");

    }

  }
}
