#! /usr/bin/perl

# File: doc/doc-docbook/TidyHTML-filter
# $Cambridge: exim/doc/doc-docbook/TidyHTML-filter,v 1.2 2005/11/10 12:30:13 ph10 Exp $

# Script to tidy up the filter HTML file that is generated by xmlto. The
# following changes are made:
#
# 1. Split very long lines.
# 2. Create reverse links from chapter and section titles back to the TOC.
# 3. Turn <div class="literallayout"><p> into <div class="literallayout"> and
#    a matching </p></div> into </div> to get rid of unwanted vertical white
#    space.
#
# The script takes no arguments. It reads filter.html from the current
# directory and rewrites that same file in place. It dies with a message if
# the file cannot be opened, or if writing the revised text fails.

use strict;
use warnings;

my $file = "filter.html";

my $tocref = 1;     # Sequence number used for the next generated TOC id
my $thisdiv = 0;    # Set after the <p> of a literallayout div has been removed
my %backref;        # Maps a TOC link target ("#xxx") to the id given to the
                    # TOC entry that points at it

# Read in the filter.html file.

open(my $in, "<", $file) or die "Failed to open $file for reading: $!\n";
my @text = <$in>;
close($in);

# Insert a newline after every > because the whole toc is generated as one
# humungous line that is hard to check. Then split the lines so that each one
# is a separate element in the vector. The substitution is applied to each
# input line separately, so the white space that it swallows after a > never
# reaches into the following input line.

foreach my $line (@text) { $line =~ s/>\s*/>\n/g; }
@text = map { split /(?<=\n)/, $_ } @text;

# We want to create reverse links from each chapter and section title back to
# the relevant place in the TOC. Scan the TOC for the relevant entries. Add
# an id to each entry, and create tables that remember the new link ids. We
# detect the start of the TOC by <div class="toc" and the end of the TOC by
# <div class="chapter".

# Skip to start of TOC

my $i = 0;
$i++ while $i < scalar(@text) && $text[$i] !~ /^<div class="toc"/;

# Scan the TOC

for (; $i < scalar(@text); $i++)
  {
  last if $text[$i] =~ /^<div class="chapter"/;
  if ($text[$i] =~ /^<a href="(#[^"]+)">/)
    {
    my $target = $1;
    my $id = sprintf "toc%04d", $tocref++;
    $text[$i] =~ s/<a/<a id="$id"/;
    $backref{$target} = $id;
    }
  }

# Scan remainder of the document

for (; $i < scalar(@text); $i++)
  {
  # The line after the current one, or an empty string at the end of the
  # text, so that the comparisons below never look at an undefined value.

  my $next = ($i < $#text)? $text[$i+1] : "";

  if ($text[$i] eq "<div class=\"literallayout\">\n" && $next eq "<p>\n")
    {
    $text[++$i] = "";
    $thisdiv = 1;
    }
  elsif ($thisdiv && $text[$i] eq "</p>\n" && $next eq "</div>\n")
    {
    $text[$i] = "";
    $thisdiv = 0;
    }
  elsif ($text[$i] =~ /^<h[23] /)
    {
    # Only the line that directly follows the heading tag is considered as
    # the heading's anchor.

    $i++;
    last if $i > $#text;

    if ($text[$i] =~ /^<a( xmlns="[^"]+")? id="([^"]+)">$/)
      {
      my $xmlns = defined $1 ? $1 : "";   # The xmlns attribute is optional
      my $anchor = $2;
      my $ref = $backref{"#$anchor"};

      # Rewrite the anchor only when the TOC has an entry for it (otherwise
      # the link would be a bare href="#") and when both of the lines that
      # are to be exchanged exist.

      if (defined $ref && $i + 2 <= $#text)
        {
        $text[$i] = "<a$xmlns href=\"#$ref\" id=\"$anchor\">\n";

        # Exchange the two lines that follow the anchor, then step past
        # them so that they are not examined again.

        @text[$i+1, $i+2] = @text[$i+2, $i+1];
        $i += 2;
        }
      }
    }
  }

# Write out the revised file. Buffered output errors may not be reported
# until the close, so the results of both the print and the close are checked.

open(my $out, ">", $file) or die "Failed to open $file for writing: $!\n";
print {$out} @text or die "Failed to write $file: $!\n";
close($out) or die "Failed to write $file: $!\n";

# End