#
############################################################
# MODULE:    MARC record to tag-valid SGML converter
# VERSION:   1.0
# DATE:      November 17, 1997
#
# MULBERRY INTERNAL VERSION CONTROL:
# $Id: mrc2sgm.pl,v 1.14 1997/11/26 15:00:49 tkg Exp $
############################################################

############################################################
# SYSTEM:    MARC to tag-valid SGML converter set
#
# PURPOSE:   Convert MARC records to tag-valid SGML
#
# CONTAINS:  1) "Use" statements for external packages
#            2) Declarations of constants
#            3) Declarations of global variables
#            4) Command-line argument processing
#            5) Initialization of log file and output file
#            6) MARC Description File processing
#            7) Declaration of handlers for processing input file
#            8) Input file processing
#            9) End processing
#            10) Subroutines
#
# PACKAGES REQUIRED:
#            1) David Megginson's SGMLS and SGMLS::Output packages
#            2) sgmlspl package
#            3) marcdesc.pl MARC Description File handler
#
# CREATED FOR:
#            Network Development and MARC Standards Office
#            The Library of Congress
#
# ORIGINAL CREATION DATE:
#            November 1997
#
# CREATED BY:
#            Mulberry Technologies, Inc.
#            17 West Jefferson Street, Suite 207
#            Rockville, MD  20850
#            Phone:  301/315-9631
#            Fax:    301/315-8285
#            e-mail: info@mulberrytech.com
#            WWW:    http://www.mulberrytech.com
############################################################

############################################################
# External packages

# David Megginson's SGMLpm package
use SGMLS;
use SGMLS::Output;

# Object-oriented version of David Megginson's SGMLSpl.pl
use SGMLSpl;

# Routines for handling MARC Description Files
require "Marcconv/marcdesc.pl";

############################################################
# Constants
#
# All constants are package globals with a "c" prefix.  Several of
# them are initialized by calling &FindMarcConvFile, so the order of
# the statements below matters: any "could not locate" warnings are
# emitted in this order, and they are emitted before the log file has
# been opened further down in the script.

# Size of chunk of input file read each time
# (the length passed to read() in the input loop)
$cChunkSize = 3000;

# Magic strings to indicate type of control field
# NOTE(review): $cNoIndicatorsOrSubfields is not referenced elsewhere
# in this file; presumably it mirrors a value used in the MARC
# Description File -- confirm before removing.
$cNoIndicatorsOrSubfields = 'noindsf';
$cPositionallyDefinedField = 'posdef';

# Constant strings for significant MARC characters
#
# These are deliberately single-quoted: each holds the *text* of a
# regular expression fragment (for example backslash-0-3-5), not the
# control character itself.  They are only ever interpolated into
# patterns (match, split, s///), where the regex engine interprets
# the escape.  Do not compare them to data with "eq".
#
# End of record delimiter
$cMarcEOR = '\035';
# End of field delimiter
$cMarcEOF = '\036';
# End of subfield delimiter
$cMarcEOS = '\037';
# Blank character
$cMarcBlank = ' ';
# Blank symbol used in MARC Description File
$cMarcBlankSymbol = '#';
# Fill character (escaped because "|" is alternation in a pattern)
$cMarcFill = '\|';

# Magic string to indicate what to use as key for selecting clusters
# for positionally-defined field
$cLeader0607Key = 'LEADER0607';

# Where to find the MARC Description File DTD
# (&FindMarcConvFile returns '' if the file is not found)
$cMarcDescDtd = &FindMarcConvFile('marcdesc.dtd');
# Where to find the MARC Description File
$cMarcDescriptionFile = &FindMarcConvFile('marcdesc.sgm');

# Constants for character conversion
# Magic strings to both determine conversion type and to output
# to log file to indicate conversion type
$cSGMLConversion = 'SGML';
$cRegisterConversion = 'Upper register to lower register';
$cCharacterConversion = 'Character conversion';
$cUserDefinedConversion = 'User-defined';

# Where to find the default conversion specification file for each
# type of conversion
#
# The DTD for character-entity conversion files
$cConversionDtdFile = &FindMarcConvFile('entmap.dtd');
# Specification file for upper-register to entity conversion
$cRegisterConversionFile = &FindMarcConvFile('register.sgm');
# Specification file for non-ASCII character to entity conversion
$cCharacterConversionFile = &FindMarcConvFile('charconv.sgm');

# Usage message
# Printed (via "die") when the command parameters are wrong.
# The text contains nothing to interpolate, so it is quoted literally.
$cUsage = <<'EndOfSynopsis';
mrc2sgm.pl [-command file]
   [-sgmlconv | -registerconv | -charconv | -userconv file]
   [-log file] [-o file] [-marcdesc file] [-help] input-file
EndOfSynopsis

# Help message
# Printed when the user specifies the -help parameter.  It is the
# usage synopsis followed by a description of every option, so the
# synopsis is kept in one place only.
$cHelp = $cUsage . <<'EndOfOptionHelp';

where:
-command file
        Read program command options from "file".

-sgmlconv
        Perform minimal, "SGML sanity" character conversion using the
        built-in conversion table

-registerconv
        Convert upper-register characters to lower-register characters
        using the built-in conversion table.

        The minimal SGML conversion will also be performed.

-charconv
        Convert characters to entities using the built-in conversion
        table

        The minimal SGML conversion will also be performed.

-userconv file
        Perform character conversion using the user-supplied
        conversion specification in "file"

        An error will be signaled if "file" is not specified or if
        "file" cannot be opened, or if "file" is not a file of the
        correct format.

        The minimal SGML conversion will also be performed.

-log file
        Write the output log to "file".  If this option is not
        specified, the log will be written to "mrc2sgm.log" in the
        current directory.

-o file
        Write the unvalidated SGML output to "file" instead of to
        the default file "stdout.sgm".

-marcdesc file
        Read the MARC Description File named "file" instead of the
        default MARC Description File that the program automatically
        reads on initialization.

-help
        Print this help information then quit.

input-file
        The name of the input MARC record file
EndOfOptionHelp

############################################################
# Global variables
#
# All globals carry a "g" prefix.  Every global that the script later
# interpolates into the log summary is given a defined value here so
# that an option the user did not supply shows up as an empty field
# rather than as an undefined value.

# SGML file with "record type" data
$gMarcDescFile = $cMarcDescriptionFile;

# Log file name
$gLogFile = 'mrc2sgm.log';

# Count of records processed
$gRecordCount = 0;

# Count of records converted
$gConvertCount = 0;

# Count of records skipped
$gSkipCount = 0;

# Output file name (the handle itself is OUTFILE)
$gOutputFile = 'stdout.sgm';

# Conversion specification count
# (number of conversion options seen; more than one is an error)
$gConversionSpecCount = 0;

# Entity conversion map file
$gEntityConversionFile = '';

# Name of the "-command" file, if one was given
$gCommandFile = '';

# Selected conversion type (one of the $c...Conversion strings) and
# the conversion specification file that goes with it
$gConversionType = '';
$gConversionSpecFile = '';

############################################################
# Process command-line arguments

# First check for a "-command  file" argument, find the command file,
# parse it for commands, then prepend the commands to the argument
# list in @ARGV.  Only the first "-command" is handled here; a second
# one is rejected by the option loop that follows.
for ($lArgCount = 0; $lArgCount <= $#ARGV; $lArgCount++) {

    next unless $ARGV[$lArgCount] =~ /^-command$/;

    # Remove "-command" and its file name from the argument list
    local($lJunk, $lCommandFile) = splice(@ARGV, $lArgCount, 2);
    local(@lCommandData) = ();

    # "-command" was the last argument: there is no file to read
    unless (defined($lCommandFile) && $lCommandFile ne '') {
        warn "\"-command\" must be followed by a file name.\n\n";
        die $cUsage;
    }

    # Explicit read mode so that the name is never treated as a pipe
    # or as an output redirection
    open(COMMANDFILE, "< $lCommandFile") ||
        die "Couldn't open \"$lCommandFile\" as command file.\n";

    # We want the command file name later
    $gCommandFile = $lCommandFile;

    while (<COMMANDFILE>) {

        # Tidy up the line before we look at it
        chomp;
        s/^\s+//;
        s/\s+$//;

        # Allow comments in lines beginning with "#".  This is tested
        # after the leading whitespace is removed so that indented
        # comments are not mistaken for arguments.
        next if /^#/;

        # Arguments are separated by whitespace; a blank line
        # contributes nothing
        push(@lCommandData, split(/\s+/));
    }

    close(COMMANDFILE);

    # Put the arguments from the command file *before* the command line
    # arguments
    unshift(@ARGV, @lCommandData);

    last;
}
	
# Process the command line (and command file) arguments.
# Options are consumed from the front of @ARGV until the first
# argument that does not start with "-"; whatever is left is treated
# as the input file name(s) later on.
while (@ARGV && $ARGV[0] =~ /^-/) {
    my($lOption) = shift(@ARGV);

    if ($lOption =~ /^-command$/) {
        # The first "-command" was removed before this loop started
        warn "\"-command\" argument may only be used once.\n";
        die $cUsage;
    } elsif ($lOption =~ /^-help$/) {
        die $cHelp;
    } elsif ($lOption =~ /^-(marcdesc|log|o|userconv)$/) {
        # These options take a file name.  Complain here, rather than
        # silently storing an undefined name, when nothing follows.
        my($lName) = $1;

        unless (@ARGV) {
            warn "Option \"$lOption\" must be followed by a file name.\n\n";
            die $cUsage;
        }

        my($lFile) = shift(@ARGV);

        if ($lName eq 'marcdesc') {
            $gMarcDescFile = $lFile;
        } elsif ($lName eq 'log') {
            $gLogFile = $lFile;
        } elsif ($lName eq 'o') {
            $gOutputFile = $lFile;
        } else {
            $gConversionSpecFile = $lFile;
            $gConversionType = $cUserDefinedConversion;
            $gConversionSpecCount++;
        }
    } elsif ($lOption =~ /^-sgmlconv$/) {
        $gConversionType = $cSGMLConversion;
        $gConversionSpecFile = '';
        $gConversionSpecCount++;
    } elsif ($lOption =~ /^-registerconv$/) {
        $gConversionType = $cRegisterConversion;
        $gConversionSpecFile = $cRegisterConversionFile;
        $gConversionSpecCount++;
    } elsif ($lOption =~ /^-charconv$/) {
        $gConversionType = $cCharacterConversion;
        $gConversionSpecFile = $cCharacterConversionFile;
        $gConversionSpecCount++;
    } else {
        warn "Unknown option \"$lOption\".\n\n";
        die $cUsage;
    }
}

############################################################
# Open log file before we get any further.  Failure is not fatal:
# the run continues and log messages are simply not written.

unless (open(LOGFILE, ">$gLogFile")) {
    warn "Couldn't open \"$gLogFile\" as log file. Continuing without log.\n";
}

&LogOpenMessage();

############################################################
# Work out what is the input file: exactly one argument must be
# left over once the options have been consumed.
if (@ARGV == 1) {
    $gInputFile = shift(@ARGV);
} elsif (@ARGV) {
    warn "Only one input file can be specified.  \"" .
        join(" ", @ARGV) .
        "\" is too many file names.\n\n";
    die $cUsage;
} else {
    warn "An input file must be specified.\n\n";
    die $cUsage;
}

############################################################
# Open output file.  An empty output file name means "write to
# standard output".

if ($gOutputFile eq '') {
    *OUTFILE = *STDOUT;

    $gOutputFile = "STDOUT";
} else {
    open(OUTFILE, ">$gOutputFile") ||
        die "Couldn't open \"$gOutputFile\" as output file.\n";
}

############################################################
# Work out the conversion type and evaluate the conversion
# specification file.  At most one conversion option is allowed;
# with none, the minimal SGML conversion is used.  Every type except
# the minimal SGML conversion replaces &ConversionRoutine with one
# generated from a conversion specification file.
if ($gConversionSpecCount > 1) {
    warn "Only one conversion specification is allowed.\n\n";
    die $cUsage;
}

if ($gConversionSpecCount == 0) {
    $gConversionType = $cSGMLConversion;
} elsif (grep($gConversionType eq $_,
              $cRegisterConversion,
              $cCharacterConversion,
              $cUserDefinedConversion)) {
    &EvalConversion($gConversionSpecFile);
}

############################################################
# Write what we know about the processing to the log file
$gCommandLineArguments = <<EndOfSummary;
------------------------------------------------------------
Command File:          $gCommandFile
MARC Description File: $gMarcDescFile
Input File:            $gInputFile
Output File:           $gOutputFile
Character Conversion:  $gConversionType
Conversion File:       $gConversionSpecFile
Log File:              $gLogFile
------------------------------------------------------------
EndOfSummary

&Log($gCommandLineArguments);

############################################################
# Process the Marc Description File
#
# The file is parsed by nsgmls and the resulting output is fed to
# SGMLSPL::Process.  $gMarcDesc is not set in this file; presumably
# it is the handler object created by marcdesc.pl -- confirm there.
#
# NOTE(review): the file names are interpolated into a shell command
# line, so a name containing shell metacharacters will be interpreted
# by the shell.

open(MARCDESC, "nsgmls $cMarcDescDtd $gMarcDescFile |") ||
    die "Couldn't run nsgmls on MARC Description File" .
        " \"$gMarcDescFile\": $!\n";
SGMLSPL::Process($gMarcDesc, MARCDESC);

# Closing the pipe waits for nsgmls and leaves its exit status in $?.
# A failure is reported but is not fatal, so a description file that
# nsgmls complains about is still used as far as it was parsed.
close(MARCDESC);
if ($? != 0) {
    warn "nsgmls reported problems with MARC Description File" .
        " \"$gMarcDescFile\".\n";
    &Log("nsgmls reported problems with MARC Description File" .
         " \"$gMarcDescFile\".");
}

# Uncomment this subroutine call to dump the data structures for the
# MARC Description File to STDOUT
# &DumpMarcDesc();

############################################################
# Read the input file one chunk at a time.
#
# $gChunk accumulates unprocessed input.  After each read, every
# complete record (everything up to an end-of-record delimiter) is
# removed from the front of $gChunk and converted; an incomplete
# record stays in $gChunk until the next read completes it.
$gChunk = '';

# Explicit read mode so that the name is never treated as a pipe
open(INFILE, "< $gInputFile") ||
    die "Cannot open input file \"$gInputFile\".\n";

# MARC records are binary data: without this, platforms that
# distinguish text files would translate line endings or stop at an
# end-of-file control character inside a record.
binmode(INFILE);

while (!eof INFILE) {
    # Read a chunk, appending it to $gChunk
    defined(read(INFILE, $gChunk, $cChunkSize, length($gChunk))) ||
        die "Error reading input file \"$gInputFile\": $!\n";

    # Process any MARC records in what we've got so far
    while ($gChunk =~ /$cMarcEOR/o) {
        # The record is everything up to the EOR
        local($lRecord) = $`;

        $gRecordCount++;

        # Set $gChunk to everything after the EOR
        $gChunk = $';

#	print "$gChunkCount:$gRecordCount:" . length($gChunk) . "\n";

        # Process the record
        &ProcessRecord($lRecord);
        # Then go around the "while" loop again to see if there's more
        # records in the current chunk
    }
}

close(INFILE);

# The file should end on an EOR, so complain if there's junk
if (length($gChunk) != 0) {
    &Log("Extraneous characters at end of file after $gRecordCount records.");
}

# print "$gChunkCount:$gRecordCount:" . length($gChunk) . "\n";

# Buffered output is only known to be safely written once the handle
# has been closed, so close it explicitly and report a failure.
unless (close(OUTFILE)) {
    local($lCloseError) = "$!";

    warn "Error closing output file \"$gOutputFile\": $lCloseError\n";
    &Log("Error closing output file \"$gOutputFile\": $lCloseError");
}

&Log("End of input file reached.");
&Log("Records processed: $gRecordCount");
&Log("Records converted: $gConvertCount");
&Log("Records skipped: $gSkipCount");
&LogCloseMessage();
close(LOGFILE);

############################################################
# &ProcessRecord($pRecord)
# Process a single MARC record
#
# $pRecord is the text of one record without its end-of-record
# delimiter.  The record is checked, split into Leader, Directory and
# field data, and written to OUTFILE as one element wrapped in the
# doctype selected by Leader character positions 06 and 07.
#
# A record that fails a check is reported with &Log, counted in
# $gSkipCount, and produces no output.  A converted record is counted
# in $gConvertCount.  Also sets the globals $gDoctype and
# $gFormatType.
sub ProcessRecord {
    # $pRecord must remain dynamically scoped ("local", not "my"):
    # &ProcessDirectory reads it to extract the field data.
    local($pRecord) = @_;
    local(@lFields);
    local($lLeaderAndDirectory, $lLeader, $lDirectory, $lLeader0607);

    # We got our MARC record by matching on the EOR, but the character
    # before the EOR should be an EOF.  ("!~" is required here: "!"
    # binds more tightly than "=~", so negating with a leading "!"
    # would test the negated string instead of the match result.)
    if ($pRecord !~ /$cMarcEOF$/o) {
        &Log("Record $gRecordCount does not end with EOF, EOR sequence." .
             "  Skipping record.\n");
        $gSkipCount++;
        return;
    }

    # Split the record on EOF characters
    @lFields = split(/$cMarcEOF/o, $pRecord);

    # First "field" is leader + directory
    $lLeaderAndDirectory = shift @lFields;

    # Remove the Leader, the Directory, and the EOF after the Directory
    # from $pRecord.  What's left has the right character offsets for
    # the character offsets in the Directory's fields.
    $pRecord = substr $pRecord, length($lLeaderAndDirectory) + 1;

    # The Leader is the first 24 characters of the Leader and Directory
    $lLeader = substr $lLeaderAndDirectory, 0, 24;
    # And the Directory comes after the Leader
    $lDirectory = substr $lLeaderAndDirectory, 24;

    unshift @lFields, $lLeader, $lDirectory;

    # Leader characters 06 and 07 are reused in several places
    $lLeader0607 = substr $lLeader, 6, 2;
    # But when we use it, we want the blank symbol, "#", not the character
    $lLeader0607 =~ s/$cMarcBlank/$cMarcBlankSymbol/go;

    # The Directory should be divisible by 12 since the fields in the
    # Directory are each 12 characters
    if (length($lDirectory) % 12 != 0) {
        &Log("Record $gRecordCount directory length is \"" .
             length($lDirectory) .
             "\", which is not divisible by 12.  Skipping record.\n");
        $gSkipCount++;
        return;
    }

    # The record is not much use if we can't make sense of the Leader
    # 06 and 07 data
    unless (defined($gLdrToDoctype{$lLeader0607})) {
        &Log("Invalid Leader cp 06-07 in record $gRecordCount;" .
             "  record not converted.\n");
        $gSkipCount++;
        return;
    }

    # Get the doctype and the format type from the information from
    # the MARC Description File based on Leader 06 and 07 data
    $gDoctype = $gLdrToDoctype{$lLeader0607}->doctype;
    $gFormatType = $gLdrToDoctype{$lLeader0607}->format_type;

    # Print the wrapping start tag
    print OUTFILE "<$gDoctype format-type=\"$gFormatType\">\n";

    &ProcessLeader($gDoctype, $gFormatType, $lLeader);

    # Process the Directory, and, indirectly, the fields in the record
    &ProcessDirectory($gDoctype, $lLeader0607, $lDirectory);

    # Print the wrapping end tag
    print OUTFILE "</$gDoctype>\n";

    # The number of records converted is output to the log file at the
    # end.  We don't increment this until now because many things could
    # have gone wrong in processing the Leader and the Directory.
    $gConvertCount++;
}

############################################################
# &ProcessLeader($pDoctype, $pFormatType, $pLeader)
# Process the Leader portion of a MARC record
#
# $pDoctype and $pFormatType are used to build the element names;
# $pLeader is the 24-character Leader.  Writes a wrapping Leader
# element to OUTFILE containing one EMPTY element per cluster listed
# in %gLdrClusters for the Leader's character position 06.
sub ProcessLeader {
    local($pDoctype, $pFormatType, $pLeader) = @_;
    # The Leader 06 value is used as the key when selecting clusters
    # into which the Leader data is divided.
    local($lLeader06) = substr $pLeader, 6, 1;
    local($lCluster);

#    print "Leader:$pLeader:\n";

    # Output the wrapping start tag for the leader
    print OUTFILE "<${pDoctype}ldr-$pFormatType>\n";

    # For each cluster, extract the designated character positions from
    # the Leader, convert clusters containing only blank characters or
    # only fill characters to the words "blank" or "fill", respectively,
    # then apply the current character to entity conversion and
    # output the tag for the element
    foreach $lCluster (@{$gLdrClusters{$lLeader06}}) {
        local($lStartCP, $lEndCP, $lData) = ();

        # Clusters may be a single character ("05") or a range of
        # characters ("00-04").  Skip anything else rather than reuse
        # the captures left over from an earlier match.
        unless ($lCluster =~ /^(\d+)(-(\d+))?$/) {
            &Log("Record $gRecordCount: ignoring malformed Leader" .
                 " cluster \"$lCluster\".");
            next;
        }

        $lStartCP = $1;
        # The end position is the third group; the second group still
        # includes the hyphen and would be read as a negative number.
        $lEndCP = $3;

        # If it's not a range, then the end character is the same as
        # the start character
        $lEndCP = $lStartCP unless defined($lEndCP) && $lEndCP ne '';

        # Extract the cluster data from the Leader
        $lData = substr $pLeader, $lStartCP, ($lEndCP - $lStartCP + 1);

        # Replace sequences of blanks or fills with the word
        $lData = 'blank' if $lData =~ /^$cMarcBlank+$/o;
        $lData = 'fill' if $lData =~ /^$cMarcFill+$/o;

        # Use the current conversion routine to turn specific characters
        # or sequences of characters into SGML entities
        $lData = &ConversionRoutine($lData);

        # Print the tag for the cluster.  These elements are EMPTY and
        # don't have an end tag.  Only the start position appears in
        # the element name.
        print OUTFILE "<${pDoctype}ldr-$pFormatType-$lStartCP value=\"$lData\">\n";
    }

    # Print the end tag for the Leader
    print OUTFILE "</${pDoctype}ldr-$pFormatType>\n";
}

############################################################
# &ProcessDirectory($pDoctype, $pLeader0607, $pDirectory)
# Process the directory portion of a MARC record
#
# $pDoctype is used to build element names and to select the groups
# in %gDtdGroups; $pLeader0607 is the Leader 06-07 key passed on to
# positionally-defined fields that ask for it; $pDirectory is the
# record's Directory.
#
# The field data itself is NOT a parameter: it is read from the
# dynamically scoped $pRecord set up by &ProcessRecord, which must
# therefore be the caller.
#
# Fields whose tag falls within no group of the doctype are not
# output; they are reported in the log.
sub ProcessDirectory {
    local($pDoctype, $pLeader0607, $pDirectory) = @_;
    local($lDirectoryEntry);
    local($lField) = '';
    local(%lFieldData) = ();
    local($lGroupStart, @lGroupStarts, @lMarcTags, %lGroups);
    local($lMarcTag, $lFieldLength, $lStartingPosition, $lFieldData);
    local(@lUngroupedTags) = ();

    # The Directory is a sequence of 12-character entries
    while ($lDirectoryEntry = substr($pDirectory, 0, 12)) {

        # Drop the 12 characters for the current entry
        $pDirectory = substr($pDirectory, 12);

        # The directory entry comprises three characters for the
        # MARC field tag, four characters for the field's length,
        # and five characters for the offset of the starting
        # position of the field's data
        ($lMarcTag, $lFieldLength, $lStartingPosition) =
            unpack("A3A4A5", $lDirectoryEntry);

        # Subtract 1 from the field length when we get the field
        # so we don't get the EOF mark
        $lField = substr($pRecord, $lStartingPosition, $lFieldLength - 1);
#	print ":$lMarcTag:$lFieldLength:$lStartingPosition:$lField:\n";

        # The same number field can occur multiple times within a MARC
        # record, so make an array of field data for each field
        push(@{$lFieldData{$lMarcTag}}, $lField);
    }

    # At this point we have all of the field data, but the fields will
    # be output in groups.  We get the groups from the MARC Description
    # File, but we need to sort the fields in the current record into
    # the appropriate groups.  Not all groups will always be present.
    # We couldn't just use the sequence of the elements in the Directory
    # since the sequence is only guaranteed for the 100s digit, and the
    # groups don't necessarily all break at a multiple of 100.

    # Sort the field numbers
    @lMarcTags = sort(keys(%lFieldData));
    # Sort the field numbers of the fields that start each group
    @lGroupStarts = sort(keys(%{$gDtdGroups{$pDoctype}}));

    # For each group, in order of its starting field number, make a list
    # of fields in the current MARC record that fall within the range
    # of field numbers for the group
  GROUPSTART:
    foreach $lGroupStart (@lGroupStarts) {
        local($lGroupEnd) = ${$gDtdGroups{$pDoctype}}{$lGroupStart}->end;
        # If there's no end field number, it's a group of one field number
        # and the end field number is the same as the start field number
        $lGroupEnd = $lGroupStart unless $lGroupEnd;

        # While we have fields from the current record, see if they are
        # within the range for the current group
        while (@lMarcTags) {
            $lMarcTag = $lMarcTags[0];

            if (!$lMarcTag || $lMarcTag < $lGroupStart) {
                # Both lists are in ascending order, so a field number
                # below the start of the current group was also beyond
                # the end of every earlier group: it belongs to no
                # group.  Set it aside; leaving it at the front of the
                # list would keep every later field from being tried.
                push(@lUngroupedTags, shift(@lMarcTags));
            } elsif ($lMarcTag <= $lGroupEnd) {
                # If we're within the range for the group, save the field
                # number as being within the current group
                push(@{$lGroups{$lGroupStart}}, shift(@lMarcTags));
            } else {
                # The field is beyond this group, so leave it on the
                # list and try again with the next group
                next GROUPSTART;
            }
        }

        # No field numbers left to place
        last GROUPSTART;
    }

    # Whatever is left lies beyond the last group
    push(@lUngroupedTags, @lMarcTags);

    if (@lUngroupedTags) {
        &Log("Record $gRecordCount: field(s) " .
             join(', ', @lUngroupedTags) .
             " are in no group for \"$pDoctype\" and were not converted.");
    }

# Uncomment this for debugging
#    foreach (sort(keys(%lGroups))) {
#	print ":$_:\n";
#	print (@{$lGroups{$_}}) . ":\n";
#    }

    # At this point, we have the field numbers assigned to groups.
    # We will output the group tags and the fields for each group
    foreach $lGroupStart (sort(keys(%lGroups))) {
        local($lGroupLabel) = ${$gDtdGroups{$pDoctype}}{$lGroupStart}->label;

        print OUTFILE "<$lGroupLabel>\n";

        # For each field number for the current group, work out if
        # we're a data variable field, a positionally-defined field,
        # or a field with no indicators or subfields, then process
        # the field data accordingly
        foreach $lMarcTag (@{$lGroups{$lGroupStart}}) {

            if (!$gControlFields{$lMarcTag}) {
                # If it's not a control field, then it must be a data
                # variable field
                foreach $lFieldData (@{$lFieldData{$lMarcTag}}) {
                    &ProcessDataVariableField($pDoctype,
                                              $lMarcTag,
                                              $lFieldData);
                }
            } elsif ($gControlFields{$lMarcTag}->field_type
                     =~ /$cPositionallyDefinedField/o) {
                # Control fields are either positionally-defined or
                # they're not.
                local($lKeyData) = '';

#		print "$lMarcTag is positionally-defined.\n";

                # For each occurrence of the field in the current MARC
                # record, we find the key for selecting the clusters,
                # then we process the positionally-defined data.
                foreach $lFieldData (@{$lFieldData{$lMarcTag}}) {

                    # The choice of clusters for positionally-defined
                    # fields is keyed from either Leader 06 and 07 data
                    # or from the first character in the field's data.
                    # The MARC Description File specifies which it is for
                    # each positionally-defined field, and we find our
                    # key data accordingly.
                    if ($gControlFields{$lMarcTag}->key
                        =~ /^$cLeader0607Key$/o) {
                        $lKeyData = $pLeader0607;
                    } else {
                        $lKeyData = substr $lFieldData, 0, 1;
                    }

                    &ProcessPositionallyDefinedField($pDoctype,
                                                     $lMarcTag,
                                                     $lKeyData,
                                                     $lFieldData);
                }
            } else {
                # If it is a control field and it's not
                # positionally-defined, then it must be a field without
                # indicators or subfields.

#		print "$lMarcTag has no indicators or subfields.\n";

                foreach $lFieldData (@{$lFieldData{$lMarcTag}}) {
                    &ProcessNoIndicatorOrSubfieldField($pDoctype,
                                                       $lMarcTag,
                                                       $lFieldData);
                }
            }
        }

        # Print the tag for the end of the group
        print OUTFILE "</$lGroupLabel>\n";
    }
}

############################################################
# &ProcessPositionallyDefinedField($pDoctype, $pMarcTag,
#                                  $pKeyData, $pField)
# Process a positionally-defined field
#
# $pDoctype and $pMarcTag are used to build the element names;
# $pKeyData selects which cluster arrangement of the field applies;
# $pField is the field's data.  Writes one wrapping element to
# OUTFILE containing one EMPTY element per cluster.
#
# If the MARC Description File has no cluster arrangement for
# $pKeyData, the field is reported in the log and nothing is output.
sub ProcessPositionallyDefinedField {
    local($pDoctype, $pMarcTag, $pKeyData, $pField) = @_;
    # The cluster arrangement selected by the key data.  The key can
    # come straight from the record (the first character of the
    # field), so it may be a value the description file does not list.
    local($lArrangement) =
        $gControlFields{$pMarcTag}->clusters->{$pKeyData};
    local($lSubtype, $lCluster);

    unless (defined($lArrangement)) {
        &Log("Record $gRecordCount: field $pMarcTag has unknown key" .
             " \"$pKeyData\"; field not converted.");
        return;
    }

    # The subtype, which is output as part of the SGML tags, is
    # determined by the key data that also selects the cluster
    # arrangement
    $lSubtype = $lArrangement->subtype;

    print OUTFILE "<$pDoctype$pMarcTag-$lSubtype>\n";

    # For each cluster, where the cluster groups are selected by
    # the key data from the information in the MARC Description File,
    # extract the corresponding character positions' data from the
    # field, replace blank or fill fields with words, then output
    # the start tag (but no end tag, since this is an EMPTY element)
    # for the cluster
    foreach $lCluster (@{$lArrangement->clusters}) {
        local($lStartCP, $lEndCP, $lData) = ();
        local($lEndCPString) = "";

        # The cluster may be a single character or a range of
        # characters.  Skip anything else rather than reuse the
        # captures left over from an earlier match.
        unless ($lCluster =~ /^(\d+)(-(\d+))?$/) {
            &Log("Record $gRecordCount: ignoring malformed cluster" .
                 " \"$lCluster\" for field $pMarcTag.");
            next;
        }

        $lStartCP = $1;
        $lEndCP = $3;

        # If it's a range, the end character number is included in the
        # SGML tag
        if (defined($lEndCP) && $lEndCP ne '') {
            $lEndCPString = "-$lEndCP";
        } else {
            # If it's not a range, the end character position is the
            # same as the start character position, and the tag does
            # not indicate a range
            $lEndCP = $lStartCP;
            $lEndCPString = '';
        }

        # Extract the data.  If the start character is the same as
        # the end character, we get one character, and if it's a range,
        # we get the right number of characters.
        $lData = substr $pField, $lStartCP, ($lEndCP - $lStartCP + 1);

        # Replace clusters containing only fills or blanks with the
        # appropriate word
        $lData = 'blank' if $lData =~ /^$cMarcBlank+$/o;
        $lData = 'fill' if $lData =~ /^$cMarcFill+$/o;

#	print ":$pField:$lStartCP:$lEndCP:$lData:\n";

        # Output the tag for the cluster
        print OUTFILE "<$pDoctype$pMarcTag-$lSubtype-$lStartCP$lEndCPString" .
            " value=\"$lData\">\n";
    }

    # Output the end tag for the positionally-defined field
    print OUTFILE "</$pDoctype$pMarcTag-$lSubtype>\n";
}

############################################################
# &ProcessNoIndicatorOrSubfieldField($pDoctype, $pMarcTag, $pField)
# Process a control field that does not have indicators or subfields.
# The whole field becomes a single element on one output line: the
# start tag, the converted field data, and the end tag.
sub ProcessNoIndicatorOrSubfieldField {
    my($pDoctype, $pMarcTag, $pField) = @_;
    # The element name is the doctype followed by the field number
    my($lElement) = $pDoctype . $pMarcTag;

    print OUTFILE "<$lElement>" .
        &ConversionRoutine($pField) .
        "</$lElement>\n";
}

############################################################
# &ProcessDataVariableField($pDoctype, $pMarcTag, $pField)
# Process a data variable field
#
# The field is written to OUTFILE as an element named after the
# doctype and field number, carrying the two indicators as the "i1"
# and "i2" attributes and containing one child element per subfield.
sub ProcessDataVariableField {
    my($pDoctype, $pMarcTag, $pField) = @_;
    # Every tag for this field starts with the same name
    my($lElement) = "$pDoctype$pMarcTag";
    # Splitting on the subfield delimiter leaves the two indicator
    # characters as the first piece, followed by the subfields
    my(@lPieces) = split(/$cMarcEOS/o, $pField);
    my($lFirst, $lSecond) = unpack("aa", shift(@lPieces));
    my($lStartTag) = "<$lElement";
    my($lNumber) = 0;
    my($lIndicator, $lPiece);

    # Add one attribute per indicator.  A blank or fill indicator is
    # spelled out as a word; anything else is used as it stands.
    foreach $lIndicator ($lFirst, $lSecond) {
        my($lValue) = $lIndicator;

        $lNumber++;

        if ($lIndicator =~ /^$cMarcBlank$/o) {
            $lValue = 'blank';
        } elsif ($lIndicator =~ /^$cMarcFill$/o) {
            $lValue = 'fill';
        }

        $lStartTag .= " i$lNumber=\"i$lNumber-$lValue\"";
    }

    print OUTFILE "$lStartTag>\n";

    # The first character of each subfield is its identifier, which
    # becomes part of the element name; the rest is the content,
    # which goes through the current character conversion.
    foreach $lPiece (@lPieces) {
        my($lCode) = substr($lPiece, 0, 1);
        my($lText) = substr($lPiece, 1);

        print OUTFILE "<$lElement-$lCode>" .
            &ConversionRoutine($lText) .
            "</$lElement-$lCode>\n";
    }

    # Output the end tag for the data variable field
    print OUTFILE "</$lElement>\n";
}

############################################################
# &ConversionRoutine($pText)
# The built-in conversion: return a copy of the argument in which the
# few characters that matter to SGML have been replaced by entity
# references.  "&" is deliberately left alone because the spec said
# not to convert it.
#
# When the user selects another conversion on the command line,
# &EvalConversion defines a new "&ConversionRoutine" from the
# conversion specification file, and that definition replaces this
# one.
sub ConversionRoutine {
    my($lText) = @_;

    # "<" would otherwise start a tag
    $lText =~ s/</\&lt;/g;

    # Joiner and non-joiner are control characters in the C1 range
    # that most SGML Declarations would declare non-SGML
    $lText =~ s/\x8D/\&joiner;/g;
    $lText =~ s/\x8E/\&nonjoinr;/g;

    return $lText;
}

############################################################
# &EvalConversion($pConversionFile)
# Read $pConversionFile and make the entity conversion subroutine.
# The conversion subroutine will replace the default "&ConversionRoutine"
# because it is evaluated after the built-in subroutine.
#
# $pConversionFile is the name of a conversion specification file.
# It is parsed with nsgmls against $cConversionDtdFile, and the
# character-to-entity pairs found in it are collected in the global
# %gCharacterToEntity.
#
# Dies if the file cannot be read or nsgmls cannot be started.
# Problems reported by nsgmls, unusable entries in the file, and a
# generated subroutine that fails to compile are reported as warnings;
# in the last case the built-in "&ConversionRoutine" stays in effect.
#
# NOTE(review): the file names are interpolated into a shell command
# line, so a name containing shell metacharacters will be interpreted
# by the shell.
sub EvalConversion {
    local($pConversionFile) = @_;
    local($lConversionSubroutine, $lCharacter);

    # The help text promises an error when the conversion file is not
    # specified or cannot be opened, so check before starting nsgmls.
    unless (defined($pConversionFile) && $pConversionFile ne '' &&
            -r $pConversionFile) {
        &Log("Cannot read conversion specification file" .
             " \"$pConversionFile\".");
        die "Cannot read conversion specification file" .
            " \"$pConversionFile\".\n";
    }

    # Create a new SGMLSPL object.
    $gEntityMap = SGMLSPL->new;

    %gCharacterToEntity = ();

    # Setup the functions to perform for each of the elements in the
    # DTD used for the character-to-entity mapping.  When the conversion
    # file is processed, the anonymous subroutines that are the last
    # argument in each of these subroutine calls will be evaluated as
    # the corresponding start or end tag is processed.

    # For <desc> start tags, save the output
    SGMLSPL::sgml($gEntityMap, '<DESC>', sub {
        push_output 'string';
    });

    # For <desc> end tags, "unsave" the output but don't do anything
    # with it.  In effect, drop the character data content of the
    # <desc> element.
    SGMLSPL::sgml($gEntityMap, '</DESC>', sub {
        pop_output;
    });

    # For <entity> start tags, save the character data content
    SGMLSPL::sgml($gEntityMap, '<ENTITY>', sub {
        push_output 'string';
    });

    # For <entity> end tags, put the saved character data into a
    # hash keyed on the value of the "hex" attribute of the
    # parent <character> element, unless we've already defined
    # a value for that key.
    SGMLSPL::sgml($gEntityMap, '</ENTITY>', sub {
        my $lElement = shift;
        my $lEntityName = pop_output;
        my $lHex = $lElement->parent->attribute('HEX')->value;

        $gCharacterToEntity{$lHex} = $lEntityName
            unless defined($gCharacterToEntity{$lHex});
    });

    # Now that we've set up how to process the conversion file,
    # parse the file using nsgmls and process the ESIS output of nsgmls
    open(ENTITYMAP, "nsgmls $cConversionDtdFile $pConversionFile |") ||
        die "Couldn't run nsgmls on \"$pConversionFile\": $!\n";
    SGMLSPL::Process($gEntityMap, ENTITYMAP);

    # Closing the pipe waits for nsgmls and leaves its exit status in
    # $?.  Report a file that nsgmls objects to, but carry on with
    # whatever was read from it.
    close(ENTITYMAP);
    if ($? != 0) {
        warn "nsgmls reported problems with conversion specification" .
            " file \"$pConversionFile\".\n";
        &Log("nsgmls reported problems with conversion specification" .
             " file \"$pConversionFile\".");
    }

    # At this point, we have a hash associating hexadecimal character
    # values with replacement entities.  We now construct the Perl
    # expressions for a subroutine containing a sequence of substitution
    # expressions, then evaluate the string to create a new
    # "&ConversionRoutine" expression.
    $lConversionSubroutine = <<EndOfSubroutineStart;
sub ConversionRoutine {
    local(\$_) = \@_;
    study;
EndOfSubroutineStart

    # Create the substitution expressions in reverse sorted order to
    # get the longest string first.  Otherwise, some of the compound
    # characters would never be created.  For example, 0xA2 -> &Ostrok,
    # 0xE2 -> &acute;, and 0xE2 0xA2 -> Ostroka.  The keys are sorted
    # as strings, so "E2 A2" sorts after "E2" and, once the order is
    # reversed, comes before either "E2" or "A2": we substitute the
    # entity for the compound character, not the two entities for the
    # components of the compound character.
    foreach $lCharacter (reverse(sort(keys(%gCharacterToEntity)))) {
        local(@lHexCodes) = split(' ', $lCharacter);

        # The values below become part of Perl source text that is
        # evaluated, and they come from a file the user can supply.
        # Accept nothing but one- or two-digit hexadecimal codes here.
        if (!@lHexCodes || grep(!/^[0-9A-Fa-f]{1,2}$/, @lHexCodes)) {
            warn "Ignoring conversion for invalid hex value" .
                " \"$lCharacter\".\n";
            &Log("Ignoring conversion for invalid hex value" .
                 " \"$lCharacter\".");
            next;
        }

        # quotemeta leaves an ordinary entity name unchanged and
        # neutralizes any character that could otherwise end the
        # substitution or interpolate a variable.
        $lConversionSubroutine .=
            '    s/\x' . join('\x', @lHexCodes) . '/\&' .
                quotemeta($gCharacterToEntity{$lCharacter}) . ";/g;\n";
    }

    # We always convert a few significant characters
    $lConversionSubroutine .= <<EndOfSubroutineEnd;
    s/</\&lt;/g;
    s/\x8D/\&joiner;/g;
    s/\x8E/\&nonjoinr;/g;

    return(\$_);
}
EndOfSubroutineEnd

    # Evaluate the string that we just put together and create the new
    # "&ConversionRoutine" subroutine
    eval($lConversionSubroutine);
    warn $@ if $@;

#    print "Defined\n" if defined(&ConversionRoutine);
}

############################################################
# &FindMarcConvFile($pFile)
# Locate one of the converter's support files.
#
# The file is looked for as "Marcconv/$pFile" beneath each directory
# in @INC, in order, and the path of the first one that exists as a
# plain file is returned.  The built-in MARC Description File and the
# conversion specification files are expected to be installed in a
# "Marcconv" directory under Perl's "lib" directory, which is one of
# the directories in @INC.
#
# If the file is not found anywhere, a warning is printed, the
# failure is passed to &Log, and an empty string is returned.
sub FindMarcConvFile {
    my($pFile) = @_;
    # Every place the file could be, in @INC order
    my(@lCandidates) = map("$_/Marcconv/$pFile", @INC);
    my($lFound) = grep(-f $_, @lCandidates);

    return $lFound if defined($lFound);

    warn "Could not locate required file \"$pFile\".\n";
    &Log("Could not locate required file \"$pFile\".");
    return '';
}

############################################################
# Log file and error reporting routines
############################################################

############################################################
# &LogOpenMessage()
# Record in the log the program name and the local time at which
# the run started
sub LogOpenMessage {
    my($lStamp) = scalar(localtime(time));

    &Log("$0 started at $lStamp\n");
}

############################################################
# &LogCloseMessage()
# Record in the log the program name and the local time at which
# the run ended
sub LogCloseMessage {
    my($lStamp) = scalar(localtime(time));

    &Log("$0 ended at $lStamp\n");
}

############################################################
# &Log($pMessage)
# Write $pMessage to the log file (the LOGFILE handle) as one entry
# ending in exactly one added newline
sub Log {
    my($lEntry) = shift;

    # The caller may or may not have ended the message with a
    # newline; drop it so that the one added below is not doubled.
    chomp($lEntry);

    print LOGFILE "$lEntry\n";
}

