doc/doc-docbook/Pre-xml

   1 #! /usr/bin/perl
   2
   3 # $Cambridge: exim/doc/doc-docbook/Pre-xml,v 1.2 2005/11/10 12:30:13 ph10 Exp $
   4
   5 # Script to pre-process XML input before processing it for various purposes.
   6 # Options specify which transformations are to be done. Monospaced literal
   7 # layout blocks are never touched.
   8
   9 # Changes:
  10
  11 # -abstract: Remove the <abstract> element
  12
  13 # -ascii:    Replace &#8230;  (sic, no x) with ...
  14 #            Replace &#x2019; by '
  15 #            Replace &#x201C; by "
  16 #            Replace &#x201D; by "
  17 #            Replace &#x2013; by -
  18 #            Replace &#x2020; by *
  19 #            Replace &#x2021; by **
  20 #            Replace &#x00a0; by a space
  21 #            Replace &#169;   by (c)
  22 #            Put quotes round <literal> text
  23 #            Put quotes round <quote> text
  24 #
  25 # -bookinfo: Remove the <bookinfo> element from the file
  26 #
  27 # -fi:       Replace "fi" by &#xFB01; except when it is in an XML element, or
  28 #            inside a <literal>.
  29 #
  30 # -html:     Certain things are done only for HTML output:
  31 #
  32 #            If <literallayout> is followed by optional space and then a
  33 #            newline, the space and newline are removed, because otherwise you
  34 #            get a blank line in the HTML output.
  35 #
  36 # -noindex   Remove the XML to generate a Concept and an Options index.
  37 # -oneindex  Ditto, but add XML to generate a single index.
  38
  39
  40
  41 # The function that processes non-literal monospaced text
  42
  43 sub process()
  44 {
  45 my($s) = $_[0];
  46
  47 $s =~ s/fi(?![^<>]*>)/&#xFB01;/g if $ligatures;
  48
  49 if ($ascii)
  50   {
  51   $s =~ s/&#8230;/.../g;
  52   $s =~ s/&#x2019;/'/g;
  53   $s =~ s/&#x201C;/"/g;
  54   $s =~ s/&#x201D;/"/g;
  55   $s =~ s/&#x2013;/-/g;
  56   $s =~ s/&#x2020;/*/g;
  57   $s =~ s/&#x2021;/**/g;
  58   $s =~ s/&#x00a0;/ /g;
  59   $s =~ s/&#169;/(c)/g;
  60   $s =~ s/<quote>/"/g;
  61   $s =~ s/<\/quote>/"/g;
  62   }
  63
  64 $s;
  65 }
  66
  67
  68 # The main program
  69
  70 $abstract  = 0;
  71 $ascii     = 0;
  72 $bookinfo  = 0;
  73 $html      = 0;
  74 $inliteral = 0;
  75 $inliterallayout = 0;
  76 $ligatures = 0;
  77 $madeindex = 0;
  78 $noindex   = 0;
  79 $oneindex  = 0;
  80
  81 foreach $arg (@ARGV)
  82   {
  83   if    ($arg eq "-fi")       { $ligatures = 1; }
  84   elsif ($arg eq "-abstract") { $abstract = 1; }
  85   elsif ($arg eq "-ascii")    { $ascii = 1; }
  86   elsif ($arg eq "-bookinfo") { $bookinfo = 1; }
  87   elsif ($arg eq "-html")     { $html = 1; }
  88   elsif ($arg eq "-noindex")  { $noindex = 1; }
  89   elsif ($arg eq "-oneindex") { $oneindex = 1; }
  90   else  { die "** Pre-xml: Unknown option \"$arg\"\n"; }
  91   }
  92
  93 while (<STDIN>)
  94   {
  95   # Remove <abstract> if required
  96
  97   next if ($abstract && /^\s*<abstract>/);
  98
  99   # Remove <bookinfo> if required
 100
 101   if ($bookinfo && /^<bookinfo/)
 102     {
 103     while (<STDIN>) { last if /^<\/bookinfo/; }
 104     next;
 105     }
 106
 107   # Copy monospaced literallayout blocks
 108
 109   if (/^<literallayout class="monospaced">/)
 110     {
 111     $_ = substr($_, 0, -1) if $html;
 112     print;
 113     while (<STDIN>)
 114       {
 115       print;
 116       last if /^<\/literallayout>/;
 117       }
 118     next;
 119     }
 120
 121   # Adjust index-generation code if required
 122
 123   if (($noindex || $oneindex) && /^<index[\s>]/)
 124     {
 125     while (<STDIN>)
 126       {
 127       last if /^<\/index>/;
 128       }
 129
 130     if ($oneindex && !$madeindex)
 131       {
 132       $madeindex = 1;
 133       print "<index><title>Index</title></index>\n";
 134       }
 135
 136     next;
 137     }
 138
 139   # A line that is not in a monospaced literal block; keep track of which
 140   # parts are in <literal> and which not. The latter get processed by the
 141   # function above. Items in <literal> get quoted unless they are also in
 142   # a <literallayout> block, or are already being quoted.
 143
 144   for (;;)
 145     {
 146     $_ = substr($_, 0, -1) if $html && /^<literallayout[^>]*>\s*\n$/;
 147     $inliterallayout = 1 if /^<literallayout/;
 148     $inliterallayout = 0 if /^<\/literallayout/;
 149
 150     if ($inliteral)
 151       {
 152       if (/^(.*?)<\/literal>(?!<\/quote>)(.*)$/)
 153         {
 154         print $1;
 155         print "\"" if $ascii && !$inliterallayout;
 156         print "</literal>";
 157         $inliteral = 0;
 158         $_ = "$2\n";
 159         }
 160       else
 161         {
 162         print;
 163         last;
 164         }
 165       }
 166
 167     # Not in literal state
 168
 169     else
 170       {
 171       if (/^(.*?)(?<!<quote>)<literal>(.*)$/)
 172         {
 173         print &process($1);
 174         print "<literal>";
 175         print "\"" if $ascii && !$inliterallayout;
 176         $inliteral = 1;
 177         $_ = "$2\n";
 178         }
 179       else
 180         {
 181         print &process($_);
 182         last;
 183         }
 184       }
 185     }    # Loop for different parts of one line
 186   }      # Loop for multiple lines
 187
 188 # End