doc/doc-scripts/fc2k

   1 #! /usr/bin/perl -w
   2 # $Cambridge: exim/doc/doc-scripts/fc2k,v 1.2 2004/10/14 09:53:11 ph10 Exp $
   3
   4 # Script to read the HTML table of contents for the Exim FAQ and create an
   5 # HTML KWIC index out of it.
   6
   7
   8 ########################################################################
   9 # List of words to ignore - kept alphabetically for reference, but they
  10 # don't have to be in order.
  11
  12 $ignore_list = "
  13
  14 a ability able about absence access according actual address addresses addressed
  15 affect affected after against aka all allow allowed allows along already also
  16 although always am amount an ancient and and/or annoying another any anybody
  17 anyone anything anywhere apparent apparently are aren't around arrange arrive
  18 arrives as at
  19
  20 back bad based basically be because been behave behaviour being best between
  21 bob both box bug build builds built busy but by
  22
  23 call called calls can can't cannot causes causing central certain code comes
  24 coming command commands complain complaining complains configure configured
  25 conjunction contact contain contains contained correct correctly could
  26 currently customer
  27
  28 day days defined deliver delivers delivered delivery deliveries did do does
  29 doesn't doing don't down during
  30
  31 e-mail e-mails each easy either else email emails entirely entries entry
  32 especially etc even ever every example exim exim's experiencing
  33
  34 far few file files find finds fine fix fixed fly following for form found from
  35 fully
  36
  37 generate generated get gets getting given gives giving go goes going got
  38
  39 handle handles handled handling happen happens has have haven't having helpful
  40 him host hosts how however
  41
  42 i i'd i'm i've if in indeed instead into is issue issues isn't it it's its
  43
  44 jim just
  45
  46 keep keeps know knows
  47
  48 like line lines look looked looking lot
  49
  50 m machine machines machine's mail mails main make me mean means message messages
  51 might more much must my myself
  52
  53 near need neither no nor not now
  54
  55 occur of off often ok on one only or other our out over own
  56
  57 part parts particular per place possibility possible present problem problems
  58 put puts
  59
  60 quite
  61
  62 raised rather really reason rid right round run runs
  63
  64 same say saying see seeing seem seems seen sees set setting she should simply
  65 sit so some somehow something sometimes stand state statement still strange such
  66 supposed system systems
  67
  68 take takes tell than that the their them then there these they things think this
  69 those thought to try though to/for told too tried tries trying
  70
  71 under until up use uses used using usually
  72
  73 valid value values via
  74
  75 want wanted wanting was way we we've well what what's when where whereabouts
  76 whenever whether which while who whose why will with within without wish won't
  77 wondered work worked working works would wrong
  78
  79 xxx
  80
  81 yet yyy
  82
  83 ";
  84 ########################################################################
  85
  86
  87 # The regular expression fragment that defines the separator between words
  88
  89 $wordgap = "(?:[]().?,;:\"']|(?><[^>]*>))*(?:\\s+|\$)(?:[[(\"'`]|(?><[^>]*>))*";
  90
  91
  92 ########################################################################
  93 # Function to add to a length to accommodate HTML stuff
  94
  95 sub setlen{
  96 my($len, $s) = @_;
  97
  98 $len += length($1) while ($s =~ /(<\/?[a-z]+>)/ig);
  99 $len += 1 while ($s =~ /&#\d+;/g);
 100
 101 return $len;
 102 }
 103
 104
 105 ########################################################################
 106 # Function to write out the list of initials with references
 107
 108 sub write_initials {
 109 my($this_initial) = "$_[0]";
 110
 111 print OUT "<p>\n&nbsp;&nbsp;";
 112
 113 foreach $initial (sort keys %initials)
 114   {
 115   if ($initial eq $this_initial)
 116     {
 117     print OUT "&nbsp;&nbsp;&nbsp;<font size=7 color=\"#FF0A0A\"><b>$initial</b></font>&nbsp;";
 118     }
 119   else
 120     {
 121     print OUT "<a href=\"FAQ-KWIC_$initial.html\">&nbsp;&nbsp;$initial</a>";
 122     }
 123   }
 124
 125 print OUT "&nbsp;"x4 . "<a href=\"FAQ.html#TOC\">FAQ Contents</a>\n</p>\n";
 126 }
 127
 128
 129
 130 ########################################################################
 131 # The main program. We can pick out the contents lines because they lie
 132 # between <li> and </li> in the file, sometimes on more than one physical
 133 # line.
 134
 135 # Turn the list of ignorable words into a hash for quick lookup. Add the
 136 # empty word to the list.
 137
 138 @words = split /\s+/, $ignore_list;
 139 foreach $word (@words) { $ignore{$word} = 1; }
 140 $ignore{""} = 1;
 141
 142
 143 # Open the file and do the job
 144
 145 open(IN, "html/FAQ.html") || die "Can't open html/FAQ.html\n";
 146
 147 while (<IN>)
 148   {
 149   next unless /^<li>/;
 150   $_ .= <IN> while !/<\/li>$/;
 151   chomp;
 152   s/\n\s*/ /g;
 153
 154   # Extract the operative text into $text, with the beginning in $pre.
 155
 156   my($pre,$text,$post) = /^<li>(.*<\/a>:(?:&nbsp;)*)(.*)<br><br><\/li>$/;
 157
 158   # Now split into words. As well as punctuation, there may be HTML thingies
 159   # between words. Absorb them into the separators.
 160
 161   my(@words) = split /$wordgap/, $text;
 162
 163   # Lower case all the words, and remove those that we don't want.
 164   # Then keep a list of all the used initials.
 165
 166   REMOVE_IGNORE:
 167   for ($i = 0; $i < scalar @words; $i++)
 168     {
 169     my($word) = $words[$i] = "\L$words[$i]\E";
 170
 171     # Remove certain forms of word and those on the ignore list
 172
 173     if (defined $ignore{$word} ||  # word on ignore list
 174         $word =~ /^-+$/        ||  # word consists entirely of hyphens
 175         $word =~ /^-[^a-z]/    ||  # follows leading hyphen with non-letter
 176         $word =~ /^[^a-z-]/    ||  # starts with a non-letter or hyphen
 177         $word =~ /[@^.]/           # contains @ or ^ or .
 178        )
 179       {
 180       splice(@words, $i, 1);
 181       redo REMOVE_IGNORE if $i < scalar @words;
 182       }
 183
 184     # Otherwise, build up a list of initials
 185
 186     else
 187       {
 188       my($inword) = $word;
 189       $inword =~ s/^-//;
 190       $initial = substr($inword, 0, 1);
 191       $initials{"\U$initial\E"} = 1;
 192       }
 193     }
 194
 195   # Create the lines for the KWIC index, and store them in associative
 196   # arrays, with the keyword as the key. That will get them sorted
 197   # automatically.
 198
 199   while (scalar @words > 0)
 200     {
 201     my($word) = shift @words;
 202     my($pretext, $casedword, $posttext) =
 203       $text =~ /(.*?)(?<![a-z])(\Q$word\E)(?![a-z])(.*)/i;
 204
 205     # Remove a leading hyphen from $word so that it sorts according to
 206     # the leading letter. What is actually output is $casedword, which
 207     # retains the hyphen.
 208
 209     $word =~ s/^-//;
 210
 211     my($prelen) = length $pretext;
 212     my($postlen) = length $posttext;
 213
 214     # We want to chop excessively long entries on either side. We can't set
 215     # a fixed length because of the HTML control data. Call a function to
 216     # add the given length to allow for HTML stuff. This is crude, but it
 217     # does roughtly the right thing.
 218
 219     my($leftlen) = &setlen(70, $pretext);
 220     my($rightlen) = &setlen(70, $posttext);
 221
 222     if ($prelen > $leftlen)
 223       {
 224       my($cutoff) = $leftlen;
 225       $cutoff++
 226         while ($cutoff < $prelen && substr($pretext, -$cutoff, 1) ne " ");
 227       $pretext = "... " . substr($pretext, -$cutoff);
 228       }
 229
 230     if ($postlen > $rightlen)
 231       {
 232       my($cutoff) = $rightlen;
 233       $cutoff++
 234         while ($cutoff < $postlen && substr($posttext, $cutoff, 1) ne " ");
 235       $posttext = substr($posttext, 0, $cutoff) . "...";
 236       }
 237
 238     # If the pre text has a font-ending not preceded by a font beginning
 239     # (i.e. we've chopped the beginning off), we must insert a beginning.
 240
 241     while ($pretext =~ /^(.*?)<\/(small|tt|b|i)>/ && $1 !~ /<$2>/)
 242       {
 243       $pretext = "<$2>" . $pretext;
 244       }
 245
 246     # If the pre text ends in a special font, we have to terminate that,
 247     # and reset it at the start of the post text.
 248
 249     my($poststart) = "";
 250
 251     while ($pretext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
 252       {
 253       $pretext .= "</$1>";
 254       $poststart .= "<$1>";
 255       }
 256
 257     # If the post text changes font but doesn't close it, we must add
 258     # the closure.
 259
 260     while ($posttext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
 261       {
 262       $posttext .= "</$1>";
 263       }
 264
 265     # Remove any unnecessary changes in either of them
 266
 267     $pretext  =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
 268     $posttext =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
 269
 270     # Save the texts in associative arrays. Add the question number to
 271     # the end of the word to make the key.
 272
 273     $pre =~ /(Q\d\d\d\d)/;
 274     my($key) = "$word-$1";
 275
 276     $tableft{$key}  = $pre . $pretext;
 277     $tabright{$key} = $poststart .
 278       "<font color=\"#FF0A0A\">$casedword</font>" . $posttext;
 279     }
 280   }
 281
 282 close(IN);
 283
 284 # Now write out the files. Each letter in the index goes in a different file
 285
 286 $current_initial = "";
 287
 288 foreach $key (sort keys %tableft)
 289   {
 290   my($initial) = $key =~ /^(.)/;
 291   $initial = "\U$initial\E";
 292
 293   if ($initial ne $current_initial)
 294     {
 295     if ($current_initial ne "")
 296       {
 297       print OUT "</table>\n";
 298       &write_initials($current_initial);
 299       print OUT "</body>\n</html>\n";
 300       close OUT;
 301       }
 302
 303     open (OUT, ">html/FAQ-KWIC_$initial.html") ||
 304       die "Can't open html/FAQ-KWIC_$initial.html\n";
 305     print OUT
 306       "<html>\n" .
 307       "<head>\n" .
 308       "<title>Exim FAQ: KWIC index section $initial</title>\n" .
 309       "</head>\n" .
 310       "<body bgcolor=\"#F8F8F8\" text=\"#00005A\" link=\"#0066FF\" alink=\"#0066FF\" vlink=\"#000099\">\n" .
 311       "<h1>Exim FAQ: Keyword-in-context index</h1>\n";
 312
 313     write_initials($initial);
 314
 315     if ($initial eq "A")
 316       {
 317       print OUT <<End ;
 318 <p>
 319 This <i>Keyword-in-context</i> index for the Exim FAQ is generated
 320 automatically from the FAQ source. Browsers may not display the data very
 321 prettily, but it is hoped that it may provide a useful aid for finding things
 322 in the FAQ.
 323 </p>
 324 End
 325       }
 326
 327     print OUT "<table border>\n";
 328     $current_initial = $initial;
 329     }
 330
 331   print OUT "<tr>\n";
 332   print OUT "<td align=\"right\">$tableft{$key}</td>\n";
 333   print OUT "<td align=\"left\">$tabright{$key}</td>\n";
 334   print OUT "</tr>\n";
 335   }
 336
 337 # Close the final file
 338
 339 if ($current_initial ne "")
 340   {
 341   print OUT "</table>\n";
 342   &write_initials($current_initial);
 343   print OUT "</body>\n</html>\n";
 344   close OUT;
 345   }
 346
 347 # End