doc/doc-docbook/TidyHTML-spec

   1 #! /usr/bin/perl
   2
   3 # $Cambridge: exim/doc/doc-docbook/TidyHTML-spec,v 1.2 2005/11/10 12:30:13 ph10 Exp $
   4
   5 # Script to tidy up the spec HTML files that are generated by xmlto. The
   6 # following changes are made:
   7 #
   8 # 1. Tidy the index.html file by splitting the very long lines.
   9 # 2. Create reverse links from chapter and section titles back to the TOC.
  10 # 3. Tidy the ix01.html file - the actual index - by splitting long lines.
  11 # 4. Insert links from the letter divisions to the top of the Index.
  12 # 5. Turn <div class="literallayout"><p> into <div class="literallayout"> and
  13 #    a matching </p></div> into </div> to get rid of unwanted vertical white
  14 #    space.
  15 # 6. Before each occurrence of </td> insert &nbsp; so that the table's cell
  16 #    is a little bit wider than the text itself.
  17
  18 chdir "spec.html";
  19
  20 $tocref = 1;
  21
  22 # Read in the index.html file. It's really the TOC.
  23
  24 open(IN, "index.html") || die "Failed to open index.html for reading: $!\n";
  25 @toc = <IN>;
  26 close(IN);
  27
  28 # Insert a newline after every > because the whole toc is generated as one
  29 # humungous line that is hard to check. Then split the lines so that each one
  30 # is a separate element in the vector.
  31
  32 foreach $line (@toc) { $line =~ s/>\s*/>\n/g; }
  33 for ($i = 0; $i < scalar(@toc); $i++)
  34   { splice @toc, $i, 1, (split /(?<=\n)/, $toc[$i]); }
  35
  36 # We want to create reverse links from each chapter and section title back to
  37 # the relevant place in the TOC. Scan the TOC for the relevant entries. Add
  38 # an id to each entry, and create tables that remember the file names and the
  39 # new link ids.
  40
  41 foreach $line (@toc)
  42   {
  43   if ($line =~ /^<a href="((?:ch|ix)\d+\.html)(#[^"]+)?">/)
  44     {
  45     my($chix) = $1;
  46     my($ss) = $2;
  47     my($id) = sprintf "%04d", $tocref++;
  48     $line =~ s/<a/<a id="toc$id"/;
  49     $backref{"$chix$ss"} = "toc$id";
  50     push @chlist, $chix;
  51     }
  52   }
  53
  54 # Write out the modified index.html file.
  55
  56 open (OUT, ">index.html") || die "Failed to open index.html for writing: $!\n";
  57 print OUT @toc;
  58 close(OUT);
  59
  60 # Now scan each of the other page files and insert the reverse links. While
  61 # we are at it, we tidy up <div class="literallayout"> by removing unwanted
  62 # paragraph marks, which generate unwanted vertical space. We also insert
  63 # &nbsp; before </td> to push table cells apart from each other.
  64
  65 foreach $file (@chlist)
  66   {
  67   open(IN, "$file") || die "Failed to open $file for reading: $!\n";
  68   @text = <IN>;
  69   close(IN);
  70
  71   # Insert a newline after certain elements, and split the lines so that each
  72   # one is a separate element in the vector. This makes it easier to recognize
  73   # these elements.
  74
  75   foreach $line (@text)
  76     {
  77     $line =~ s/<p>\s*(?!\n)/<p>\n/g;
  78     $line =~ s/<\/p>\s*(?!\n)/<\/p>\n/g;
  79     $line =~ s/<\/div>\s*(?!\n)/<\/div>\n/g;
  80     $line =~ s/<div([^>]*)>(?!\n)/<div$1>\n/g;
  81     }
  82
  83   for ($i = 0; $i < scalar(@text); $i++)
  84     { splice @text, $i, 1, (split /(?<=\n)/, $text[$i]); }
  85
  86   $thisdiv = 0;
  87
  88   for ($i = 0; $i < scalar(@text); $i++)
  89     {
  90     if ($text[$i] =~ /^(.*?)<a( xmlns="[^"]+")? id="([^"]+)"><\/a>(.+?)<\/h(.*)$/)
  91       {
  92       my($pre, $opt, $id, $title, $post) = ($1, $2, $3, $4, $5);
  93
  94       # Section reference
  95       my($ref) = $backref{"$file#$id"};
  96
  97       # If not found, try for a chapter reference
  98       $ref = $backref{"$file"} if !defined $ref;
  99
 100       # Adjust the line
 101       $text[$i]= "$pre<a$opt href=\"index.html#$ref\" id=\"$id\">$title</a></h$post";
 102       }
 103
 104     elsif ($text[$i] eq "<div class=\"literallayout\">\n" && $text[$i+1] eq "<p>\n")
 105       {
 106       $text[++$i] = "";
 107       $thisdiv = 1;
 108       }
 109     elsif ($thisdiv && $text[$i] eq "</p>\n" && $text[$i+1] eq "</div>\n")
 110       {
 111       $text[$i] = "";
 112       $thisdiv = 0;
 113       }
 114     elsif ($text[$i] =~ /^\s*<\/td>/)
 115       {
 116       $text[$i] = "&nbsp;$text[$i]";
 117       }
 118     }
 119
 120   open(OUT, ">$file") || die "Failed to open $file for writing: $!\n";
 121   print OUT @text;
 122   close(OUT);
 123   }
 124
 125 # Now process the ix01.html file
 126
 127 open(IN, "ix01.html") || die "Failed to open ix01.html for reading: $!\n";
 128 @index = <IN>;
 129 close(IN);
 130
 131 # Insert a newline after every > because the whole index is generated as one
 132 # humungous line that is hard to check. Then split the lines so that each one
 133 # is a separate element in the vector.
 134
 135 foreach $line (@index) { $line =~ s/>\s*/>\n/g; }
 136 for ($i = 0; $i < scalar(@index); $i++)
 137   { splice @index, $i, 1, (split /(?<=\n)/, $index[$i]); }
 138
 139 # We want to add a list of letters at the top of the index, and link back
 140 # to them from each letter heading. First find the index title and remember
 141 # where to insert the list of letters.
 142
 143 for ($i = 0; $i < scalar(@index); $i++)
 144   {
 145   if ($index[$i] =~ /^<\/h2>$/)
 146     {
 147     $listindex = $i;
 148     last;
 149     }
 150   }
 151
 152 # Now scan through for the letter headings and build the cross references,
 153 # while also building up the list to insert.
 154
 155 $list = "<h4>\n";
 156 for (; $i < scalar(@index); $i++)
 157   {
 158   if ($index[$i] =~ /^(.)<\/h3>$/)
 159     {
 160     $letter = $1;
 161     $index[$i-1] =~ s/^/<a id="${letter}B" href="#${letter}T">/;
 162     $index[$i] =~ s/$/<\/a>/;
 163     $list .= "<a id=\"${letter}T\" href=\"#${letter}B\"> $letter</a>\n";
 164     }
 165   }
 166
 167 # Now we know which letters we have, we can insert the list.
 168
 169 $list .= "</h4>\n";
 170 splice @index, $listindex, 0, $list;
 171
 172 # Write out the modified index.html file.
 173
 174 open (OUT, ">ix01.html") || die "Failed to open ix01.html for writing: $!\n";
 175 print OUT @index;
 176 close(OUT);
 177
 178
 179 # End