#! /usr/bin/perl


$version = "0.13";

# These are the routines that you would use if you want to use
# the Quark-to-HTML converter as a component of a larger processing
# program. It is currently used by create-issue.pl

# Copyright 1994-5, The Tech
# By Jeremy Hylton
# jeremy@the-tech.mit.edu
# based on q2w3.pl by Jeremy Hylton and Reuven Lerner
# ------------------------------------------------------------
# qt2www is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.

# qt2www is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
# License for more details.

# You should have received a copy of the GNU General Public License
# along with qt2www; see the file COPYING.  If not, write to the Free
# Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
# ------------------------------------------------------------

# Revision History
# Dec   6       Jer     Created from the gutted qt2www.pl
# Dec  10       Jer     bug fix: continuing non-<P> styles over more than
#                       one paragraph.
# Feb   7 1995  Jer     added preliminary support for <$>
# Feb  13       Jer     bug treated things like <f$> that same as <$>, fixed
# Feb  27       Jer     Emph processing: searching for <Normal> instead of for
#                       <P> to insert emphasis
# Feb  27       Jer     moved initialization of $intag, $inquote to immediately
#                       before the B/I processing loop. continued lines had
#                       been leaving $intag set.
# Feb  27       Jer     test for b/i = $chars[$i] eq 'B' && $inquote eq 'False'
#                       fonts with 'B' or 'I' had been turning on bold/italic

# list of possible tag fields
%TagNames = ('Html', 0,
	     'Sep', 1, 
	     'Emph',2, 
	     'RuleB', 3,
	     'RuleA', 4,
	     );

sub MainProcessingRoutine
{
    local ($input, $output) = @_;

    open (SRC, "$input");
    open (OUT, ">$output");

    $STATE{'bold'} = 'False';
    $STATE{'italic'} = 'False';
    $prev = 'FALSE';

    $part_of_file = 'defn';

    while (<SRC>)
    {
	# processing to do: bold/italic, tags, etc.
	# first read the QuarkTag definitions
	if ($part_of_file eq 'defn')
	{
	    if (/\@([^\t\=]+)\=/)
	    {
		$defn = $1;
		if (&GetTag ($defn, 'Html') eq '')
		{
		    &status ($status_output, "Unknown tag: $defn\n");
				# could keep track of inheritance
				# and fill in missing entries at the end
		    &CopyTag ('qt2www_default', $defn);

		    if (/\@([^\t\=]+)\=\[S\"(\w+)\"\](.*)/)
		    {
			&status ($status_output, "\tBased on $2\n");
		    }
		}
	    }
	    elsif (/\@([^\t\=]+)\:/)
	    {
		$part_of_file = 'body';
	    }
	}

	# not an elsif, because the first line of the body is discovered
	# above and passed through to the second if


	if ($part_of_file eq 'body')
	{
	    # remove trailing linefeeds
	    if (/\n$/)
	    {
		chop;
	    }

s/^(\@[^:]+):/$1\@\@\@/g;

	    if ($prev eq 'TRUE')
	    {
		$_ = $_prev . $_;
	    }

	    # in general, information that needs to be placed in <> for HTML
	    # is placed within double underscores, __, during processing, 
	    # because QuarkTags also use <> and we want to eliminate QuarkTags.

	    # begin bold/italic processing

	    # if we are starting a new tag, then we need to reset the state
	    if (/^\@.*\@\@\@/)
	    {
		foreach $key (keys %STATE)
		{
		    $STATE{$key} = 'False';
		}
	    }

	    # if we are in the same style, we may need to turn b/i on
	    if ($STATE{'bold'} eq 'True')
	    {
		$prepend_bold = 'True';
	    }
	    else
	    {
		$prepend_bold = 'False';
	    }

	    if ($STATE{'italic'} eq 'True')
	    {
		$prepend_italic = 'True';
	    }
	    else
	    {
		$prepend_italic = 'False';
	    }

	    # try to eliminate all the font information first
	    s/(<[^<>\n]*)\"[\w\-]+\"([^<>\n]*>)/$1$2/g;

	    # not sure if the array is the right way to go, but the 
	    # representation is conceptually simple for the search for B and I

	    @chars = split ('',$_);

	    $intag = 'False';
	    $inquote = 'False';
	    for ($i = 0; $i <= $#chars; ++$i)
	    {
		if ($chars[$i] eq '<')
		{
		    $intag = 'True';
		}
		elsif ($intag eq 'True')
		{
			if ($chars[$i] eq '"')
			{
		    if ($inquote eq 'True')
		    {
			    $inquote = 'False';
			} else {
				$inquote = 'True';
			}
			}

		    if ($chars[$i] eq 'B' && $inquote eq 'False')
		    {
			if ($STATE{'bold'} eq 'False')
			{
			    $chars[$i] = "__BOLD_ON__";
			    $STATE{'bold'} = 'True';
			}
			else
			{
			    $chars[$i] = "__BOLD_OFF__";
			    $STATE{'bold'} = 'False';
			}
		    }
		    elsif ($chars[$i] eq 'I' && $inquote eq 'False')
		    {
			if ($STATE{'italic'} eq 'False')
			{
			    $chars[$i] = "__ITALIC_ON__";
			    $STATE{'italic'} = 'True';
			}
			else
			{
			    $chars[$i] = "__ITALIC_OFF__";
			    $STATE{'italic'} = 'False';
			}
		    }
		    elsif (($chars[$i] eq '$' && $chars[$i-1] eq '<') 
                           || ($chars[$i] eq 'P'))
		    {
			$subst = "";
			if ($STATE{'italic'} eq 'True')
			{
			    $subst .= "__ITALIC_OFF__";
			}
			if ($STATE{'bold'} eq 'True')
			{
			    $subst .= "__BOLD_OFF__";
			}
			foreach $key (keys %STATE)
			{
			    $STATE{$key} = 'False';
			}
			$chars[$i] = $subst;
		    }
		    elsif ($chars[$i] eq '>')
		    {
			$intag = 'False';
		    }
		}
	    }

	    if ($intag eq 'True')
	    {
		$_prev = $_;
		$prev = 'TRUE';
		next;
	    }
	    else
	    {
		$prev = 'FALSE';
		$_prev = "";
	    }

	    # bold and italics replaced, put line back together
	    $_ = join ('', @chars);

	    # if any styles are currently on, we need to turn them
	    # off before closing the tag
	    if ($STATE{'bold'} eq 'True')
	    {
		$_ .= "<__BOLD_OFF__>";
	    }
	    if ($STATE{'italic'} eq 'True')
	    {
		$_ .= "<__ITALIC_OFF__>";
	    }

	    # and if any where on when we entered, then we need to prepend them
	    if ($prepend_bold eq 'True')
	    {
		$_ = "__BOLD_ON__" . $_;
	    }
	    if ($prepend_italic eq 'True')
	    {
		$_ = "__ITALIC_ON__" . $_;
	    }

	    # move bold and italic info outside of Quark tags

	    # need a different pattern if its a BI
	    # otherwise either B or I will get left inside
	    s/(<[^<>\n]*)(__[BOLDITAC]+_O[NF]+__)(__[BOLDITAC]+_O[NF]+__)([^<>\n]*>)/$1$4$2$3/g;

	    s/(<[^<>\n]*)(__[BOLDITAC]+_O[NF]+__)([^<>\n]*>)/$1$3$2/g;

	    # try to process all the Quark tags

	    # first remove the manual linebreak, because spaces mean nothing
	    # to HTML
	    s/<\\n>/ /g;
	    s/<\*[^<>]*>//g;

	    # no support for emphasized normal text
	    if (/\@[^\t\=]+\@\@\@/)
	    {
		foreach $key (keys %tag)
		{
		    if (/\@$key/)
		    {
			$current_tag = $key;
		    }
		    if (&GetTag ($key, 'Html') eq 'Normal')
		    {
			$sep = &GetTag ($key, 'Sep');
			if ($sep eq 'P')
			{
			    s/@$key\@\@\@(.*)/__P__$1__\/P__/;
			}
			else
			{
			    s/@$key\@\@\@(.*)/$1__BR__/;
			}
		    }
		    else
		    {
			if (/$key/)
			{
			    s/@$key\@\@\@(.*)/$1/g;
			    $html = &GetTag ($key, 'Html');
			    $_ = "__" . $html . "__" . $_ . "__\/" . $html . "__";
			}
		    }
		}
	    }
	    elsif (length ($_) > 1)
	    {
		if (&GetTag ($current_tag, 'Html') eq 'Normal')
		{
		    if (&GetTag ($current_tag, 'Sep') eq 'P')
		    {
			$_ = "__P__" . $_ . "__\/P__";
		    }
		    else
		    {
			$_ = $_ . "__BR__";
		    }
		}
		else
		{
		    $html = &GetTag ($current_tag, 'Html');
		    $_ = "__" . $html . "__" . $_ 
			. "__\/" . $html . "__";
		}
	    }

	    # if there are any tag-like things left, kill them
	    s/\@\@+//;

	    $ret = &EscapedCharacters ($_);
	    $_ = $ret;

	    # turn __bold__ and __italic__ into html markup
	    # and check for bold or italic spaces
	    s/__BOLD_ON__(\s*)__BOLD_OFF__/$1/g;
	    s/__ITALIC_ON__(\s*)__ITALIC_OFF__/$1/g;

	    s/__BOLD_ON__/<B>/g;
	    s/__BOLD_OFF__/<\/B>/g;
	    s/__ITALIC_ON__/<I>/g;
	    s/__ITALIC_OFF__/<\/I>/g;

	    # finished translations can turn _\w_ into <\w>
	    s/__([^_\s]+)__/<$1>/g;

	    # sometimes we get <B><I></B>text</I> which is bad
	    $ret = &PostProcessBI ($_);
	    $_ = $ret;

	    # now seems like a really good time to add the emphasis modes
	    # what we'll do is add something, like B, at the beginning and
	    # end and strip out all reference to it inside

	    $emph = &GetTag ($current_tag, 'Emph');
	    $html = &GetTag ($current_tag, 'Html');
	    $sep = &GetTag ($current_tag, 'Sep');
	    if ( $emph eq 'B' || $emph eq 'I' || $emph eq 'EM'
		|| $emph eq 'STRONG' )
	    {
		$ret = &Strip ($emph, $_);
		$_ = $ret;

		if ($html eq 'Normal')
		{
		    $html = &GetTag ($current_tag, 'Html');
		}

		if ($sep eq 'BR')
		{
		    $_ = "<" . $emph . ">" . $_;
		    s#<BR>$#</$emph><BR>#i;
		}
		else
		{
		    if ($html eq 'Normal')
		    {
			$tag = 'P';
		    }
		    else
		    {
			$tag = $html;
		    }
		    s#^<$tag>#<$tag><$emph>#;
		    s#</$tag>$#</$emph></$tag>#;
		}
	    }
	    elsif ($emph eq 'HR')
	    {
		if ($html eq 'Normal')
		{
		    $html = &GetTag ($current_tag, 'Html');
		}

		if ($sep eq 'BR')
		{
		    s#<BR>$#<HR><BR>#i;
		}
		else
		{
		    s#</$html>$#</$html><HR>#;
		}
	    }

	    # clean up extraneous whitespace, because HTML ignores it
	    s/(\s)\s+/$1/g;

#	    $len = length ($_);
#	    for ($i = 0; $i < $len; $i += $length)
#	    {
#		$length = 75;
#		if (($len - $i) > 75)
#		{
#		    while (substr ($_, $i + $length, 1) ne " "
#			   && $length > 0)
#		    {
#			$length--;
#		    }
#		}
#		$length++;
#		if ($length < 1)
#		{
#		    $length = 1;
#		}
#		$line = substr ($_, $i, $length);

		$line = $_;

		# final (Fil)
		$line =~ s#^<P>\@[^=]+=(\[[^]]*\])?</P>##;

		# attention peut s'tendre sur plusieurs lignes !
		$line =~ s/\@TIMES-inter<P>([^@]+)\@TIMES-Lettrine 2L:/\n\n{{{\1}}}\n\n/;

		$line =~ s/\@?(TIMES-TXT)?<P>/\n\n/;
		$line =~ s/(<\/P>)+//;
		$line =~ s/\@TIMES-Lettrine 3L//;
		$line =~ s/^(<B>)?\@.*//;
		$line =~s//oe/g;

		print "$line\n" if ($line);
	}
    }

    if ($STATE{'bold'} eq 'True')
    {
	print OUT "</B>\n";
    }
    if ($STATE{'italic'} eq 'True')
    {
	print OUT "</I>\n";
    }

    close (SRC);
    close (OUT);
}


sub read_map
{
    local ($map) = @_;

    # QT map file format
    # tab, comma, or space delimited fields
    # each entry separated by a newline
    # quarktag name, HTML+ value, separator, emphasis value

    &SetTagValue ('qt2www_default', 'Html', 'Normal' );
    &SetTagValue ('qt2www_default', 'Sep', 'P' );
    &SetTagValue ('qt2www_default', 'Emph', "" );

}

sub status 
{
    local ($status, $output) = @_;

    if ($status eq 'yes')
    {
	print "$output";
    }
}

sub GetTag
{
    local ($style_name, $tag_name) = @_;
    local (@tags);

    @tags = split (/\|/, $tag{$style_name});

    $tags[$TagNames{$tag_name}];
}

sub SetTagValue
{
    local ($style_name, $tag_name, $tag_value) = @_;
    local (@tags);

    @tags = split (/\|/, $tag{$style_name});
    $tags[$TagNames{$tag_name}] = $tag_value;
    $tag{$style_name} = join ('|', @tags);
}

sub CopyTag
{
    local ($source, $dest) = @_;

    $tag{$dest} = $tag{$source};
}

sub EscapedCharacters
{
    local ($line) = @_;
    $_ = $line;

    # replace escaped QT characters with escaped HTML characters
    # many thanks to Ernst Smith for coding most of this section


	s/\x8A/.../g;		# pointille degoutant

	s/<\\!q>/~/g;
	s/<\\!->/-/g;
	s/<\\!s>/ /g;
	s/<\\!f>/ /g;

    s/<>//g;
    s/\c\@//g;			# Remove ^A
    s/\cA//g;			# Remove ^A
    s/\cL//g;			# Remove ^L

    s/<\\2>//g;			# Remove <\2> 
    s/<\\h>//g;			# Remove <\h> 
    s/<\\b>//g;			# Remove <\b> 

    s/<\+z\d*>//g;		# Get rid of funny baseline things

    s/<\\#34>/"/g;		# Double quotes
    s/<\\#39>/'/g;		# Single quote

    s/<\\#128>/A"/g;		# A"
    s/<\\#129>/Ao/g;		# Ao
    s/<\\#130>/&Ccedil;/g;	# C,
    s/<\\#131>/E'/g;		# E'
    s/<\\#132>/&Ntilde;/g;	# N~
    s/<\\#133>/A"/g;		# A"
    s/<\\#134>/U"/g;		# U"
    s/<\\#135>/a'/g;		# a'
    s/<\\#136>/&acirc;/g;	# a^

    s/<\\@>/@/g;		# @ ("at" sign)	
    s/<\\<>/&lt;/g;		# Left angle bracket
    s/<\\#201>//g;		# Ellipsis
    s/<\\#208>/-/g;		# N-dash
    s/<\\#209>/--/g;		# M-dash
    s/<\\#211><\\q><\\#213>/" '/g;
    # Double quotes followed by single quotes
    s/<\\#210>/"/g;		# Double quotes (smart)
    s/<\\#211>/"/g;		# Double quotes (smart)
    s/<\\#212>/'/g;		# Single quote (smart)
    s/<\\#213>/'/g;		# Single quote (smart)

    s/<\\q>/ /g;		# Thin space

    # final cleanup
    # eliminate any quark tags that haven't been handled
    s/<[^<>\n]*>//g;

    tr/\320/-/;
    s/\321/--/g;
    s/\245/*/g;			# Temporary fix for bullets (option-8)
    tr/\322/\"/;
    tr/\323/\"/;
    tr/\324/\`/;
    tr/\325/\'/;

    tr/\271/\'/;
    tr/\263/\"/;
    tr/\262/\"/;
    tr/\213/\-/;

    $_;
}

sub PostProcessBI
{
    local ($line) = @_;

    $_ = $line;

    # exhaustive list of cases, would like more abstract reordering
    s#<B></I>#</I><B>#g;
    s#<I></B>#</B><I>#g;
    s#<I></I>##g;
    s#<B></B>##g;

    $_;
}

sub Strip
{
    local ($emph, $line) = @_;

    $line = $_;

    s#<$emph>##g;
    s#</$emph>##g;

    $_;
}

sub Debug
{
    local ($i, @chars) = @_;
    local ($j);

    if ($i =~ /^\d*$/)
    {
	print "> ";
	for ($j = $i - 10; $j <= $i + 10; ++$j)
	{
	    print "$chars[$j]";
	}
	print "\n";
    }
    else
    {
	print "> $i";
    }
}

sub DumpState
{
    local (%state) = @_;

    print "##\n";
    foreach $key (keys %state)
    {
	print "# $key: $state{$key}\n";
    }
}

($source) = @ARGV;

$output = "";

&read_map ();

&MainProcessingRoutine ($source, $output);

