[exim.git] / doc / doc-scripts / fc2k

#! /usr/bin/perl -w
# $Cambridge: exim/doc/doc-scripts/fc2k,v 1.2 2004/10/14 09:53:11 ph10 Exp $

# Script to read the HTML table of contents for the Exim FAQ and create an
# HTML KWIC index out of it.


########################################################################
# List of words to ignore - kept alphabetically for reference, but they
# don't have to be in order.

$ignore_list = "

a ability able about absence access according actual address addresses addressed
affect affected after against aka all allow allowed allows along already also
although always am amount an ancient and and/or annoying another any anybody
anyone anything anywhere apparent apparently are aren't around arrange arrive
arrives as at

back bad based basically be because been behave behaviour being best between
bob both box bug build builds built busy but by

call called calls can can't cannot causes causing central certain code comes
coming command commands complain complaining complains configure configured
conjunction contact contain contains contained correct correctly could
currently customer

day days defined deliver delivers delivered delivery deliveries did do does
doesn't doing don't down during

e-mail e-mails each easy either else email emails entirely entries entry
especially etc even ever every example exim exim's experiencing

far few file files find finds fine fix fixed fly following for form found from
fully

generate generated get gets getting given gives giving go goes going got

handle handles handled handling happen happens has have haven't having helpful
him host hosts how however

i i'd i'm i've if in indeed instead into is issue issues isn't it it's its

jim just

keep keeps know knows

like line lines look looked looking lot

m machine machines machine's mail mails main make me mean means message messages
might more much must my myself

near need neither no nor not now

occur of off often ok on one only or other our out over own

part parts particular per place possibility possible present problem problems
put puts

quite

raised rather really reason rid right round run runs

same say saying see seeing seem seems seen sees set setting she should simply
sit so some somehow something sometimes stand state statement still strange such
supposed system systems

take takes tell than that the their them then there these they things think this
those thought to try though to/for told too tried tries trying

under until up use uses used using usually

valid value values via

want wanted wanting was way we we've well what what's when where whereabouts
whenever whether which while who whose why will with within without wish won't
wondered work worked working works would wrong

xxx

yet yyy

";
########################################################################


# The regular expression fragment that defines the separator between words

$wordgap = "(?:[]().?,;:\"']|(?><[^>]*>))*(?:\\s+|\$)(?:[[(\"'`]|(?><[^>]*>))*";


########################################################################
# Function to add to a length to accommodate HTML stuff

sub setlen{
my($len, $s) = @_;

$len += length($1) while ($s =~ /(<\/?[a-z]+>)/ig);
$len += 1 while ($s =~ /&#\d+;/g);

return $len;
}


########################################################################
# Function to write out the list of initials with references

sub write_initials {
my($this_initial) = "$_[0]";

print OUT "<p>\n&nbsp;&nbsp;";

foreach $initial (sort keys %initials)
  {
  if ($initial eq $this_initial)
    {
    print OUT "&nbsp;&nbsp;&nbsp;<font size=7 color=\"#FF0A0A\"><b>$initial</b></font>&nbsp;";
    }
  else
    {
    print OUT "<a href=\"FAQ-KWIC_$initial.html\">&nbsp;&nbsp;$initial</a>";
    }
  }

print OUT "&nbsp;"x4 . "<a href=\"FAQ.html#TOC\">FAQ Contents</a>\n</p>\n";
}


########################################################################
# The main program. We can pick out the contents lines because they lie
# between <li> and </li> in the file, sometimes on more than one physical
# line.

# Turn the list of ignorable words into a hash for quick lookup. Add the
# empty word to the list.

@words = split /\s+/, $ignore_list;
foreach $word (@words) { $ignore{$word} = 1; }
$ignore{""} = 1;


# Open the file and do the job

open(IN, "html/FAQ.html") || die "Can't open html/FAQ.html\n";

while (<IN>)
  {
  next unless /^<li>/;
  $_ .= <IN> while !/<\/li>$/;
  chomp;
  s/\n\s*/ /g;

  # Extract the operative text into $text, with the beginning in $pre.

  my($pre,$text,$post) = /^<li>(.*<\/a>:(?:&nbsp;)*)(.*)<br><br><\/li>$/;

  # Now split into words. As well as punctuation, there may be HTML thingies
  # between words. Absorb them into the separators.

  my(@words) = split /$wordgap/, $text;

  # Lower case all the words, and remove those that we don't want.
  # Then keep a list of all the used initials.

  REMOVE_IGNORE:
  for ($i = 0; $i < scalar @words; $i++)
    {
    my($word) = $words[$i] = "\L$words[$i]\E";

    # Remove certain forms of word and those on the ignore list

    if (defined $ignore{$word} ||  # word on ignore list
        $word =~ /^-+$/        ||  # word consists entirely of hyphens
        $word =~ /^-[^a-z]/    ||  # follows leading hyphen with non-letter
        $word =~ /^[^a-z-]/    ||  # starts with a non-letter or hyphen
        $word =~ /[@^.]/           # contains @ or ^ or .
       )
      {
      splice(@words, $i, 1);
      redo REMOVE_IGNORE if $i < scalar @words;
      }
      
    # Otherwise, build up a list of initials
     
    else
      {
      my($inword) = $word; 
      $inword =~ s/^-//; 
      $initial = substr($inword, 0, 1);
      $initials{"\U$initial\E"} = 1;
      }
    }

  # Create the lines for the KWIC index, and store them in associative
  # arrays, with the keyword as the key. That will get them sorted
  # automatically.

  while (scalar @words > 0)
    {
    my($word) = shift @words;
    my($pretext, $casedword, $posttext) =
      $text =~ /(.*?)(?<![a-z])(\Q$word\E)(?![a-z])(.*)/i;
      
    # Remove a leading hyphen from $word so that it sorts according to
    # the leading letter. What is actually output is $casedword, which
    # retains the hyphen.
    
    $word =~ s/^-//;   

    my($prelen) = length $pretext;
    my($postlen) = length $posttext;

    # We want to chop excessively long entries on either side. We can't set
    # a fixed length because of the HTML control data. Call a function to
    # add the given length to allow for HTML stuff. This is crude, but it
    # does roughtly the right thing.

    my($leftlen) = &setlen(70, $pretext);
    my($rightlen) = &setlen(70, $posttext);

    if ($prelen > $leftlen)
      {
      my($cutoff) = $leftlen;
      $cutoff++
        while ($cutoff < $prelen && substr($pretext, -$cutoff, 1) ne " ");
      $pretext = "... " . substr($pretext, -$cutoff);
      }

    if ($postlen > $rightlen)
      {
      my($cutoff) = $rightlen;
      $cutoff++
        while ($cutoff < $postlen && substr($posttext, $cutoff, 1) ne " ");
      $posttext = substr($posttext, 0, $cutoff) . "...";
      }

    # If the pre text has a font-ending not preceded by a font beginning
    # (i.e. we've chopped the beginning off), we must insert a beginning.

    while ($pretext =~ /^(.*?)<\/(small|tt|b|i)>/ && $1 !~ /<$2>/)
      {
      $pretext = "<$2>" . $pretext;
      }

    # If the pre text ends in a special font, we have to terminate that,
    # and reset it at the start of the post text.

    my($poststart) = "";

    while ($pretext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
      {
      $pretext .= "</$1>";
      $poststart .= "<$1>";
      }

    # If the post text changes font but doesn't close it, we must add
    # the closure.

    while ($posttext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
      {
      $posttext .= "</$1>";
      }

    # Remove any unnecessary changes in either of them

    $pretext  =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
    $posttext =~ s/<(small|tt|b|i)>\s*<\/\1>//g;

    # Save the texts in associative arrays. Add the question number to
    # the end of the word to make the key.

    $pre =~ /(Q\d\d\d\d)/;
    my($key) = "$word-$1";

    $tableft{$key}  = $pre . $pretext;
    $tabright{$key} = $poststart .
      "<font color=\"#FF0A0A\">$casedword</font>" . $posttext;
    }
  }

close(IN);

# Now write out the files. Each letter in the index goes in a different file

$current_initial = "";

foreach $key (sort keys %tableft)
  {
  my($initial) = $key =~ /^(.)/;
  $initial = "\U$initial\E";

  if ($initial ne $current_initial)
    {
    if ($current_initial ne "")
      {
      print OUT "</table>\n";
      &write_initials($current_initial);
      print OUT "</body>\n</html>\n";
      close OUT;
      }

    open (OUT, ">html/FAQ-KWIC_$initial.html") ||
      die "Can't open html/FAQ-KWIC_$initial.html\n";
    print OUT
      "<html>\n" .
      "<head>\n" .
      "<title>Exim FAQ: KWIC index section $initial</title>\n" .
      "</head>\n" .
      "<body bgcolor=\"#F8F8F8\" text=\"#00005A\" link=\"#0066FF\" alink=\"#0066FF\" vlink=\"#000099\">\n" .
      "<h1>Exim FAQ: Keyword-in-context index</h1>\n";

    write_initials($initial);

    if ($initial eq "A")
      {
      print OUT <<End ;
<p>
This <i>Keyword-in-context</i> index for the Exim FAQ is generated
automatically from the FAQ source. Browsers may not display the data very
prettily, but it is hoped that it may provide a useful aid for finding things
in the FAQ.
</p>
End
      }

    print OUT "<table border>\n";
    $current_initial = $initial;
    }

  print OUT "<tr>\n";
  print OUT "<td align=\"right\">$tableft{$key}</td>\n";
  print OUT "<td align=\"left\">$tabright{$key}</td>\n";
  print OUT "</tr>\n";
  }

# Close the final file

if ($current_initial ne "")
  {
  print OUT "</table>\n";
  &write_initials($current_initial);
  print OUT "</body>\n</html>\n";
  close OUT;
  }

# End
Commit	Line	Data
495ae4b0	1	#! /usr/bin/perl -w
8e26e4bf	2	# $Cambridge: exim/doc/doc-scripts/fc2k,v 1.2 2004/10/14 09:53:11 ph10 Exp $
495ae4b0 PH	3
	4	# Script to read the HTML table of contents for the Exim FAQ and create an
	5	# HTML KWIC index out of it.
	6
	7
	8	########################################################################
	9	# List of words to ignore - kept alphabetically for reference, but they
	10	# don't have to be in order.
	11
	12	$ignore_list = "
	13
8e26e4bf PH	14	a ability able about absence access according actual address addresses addressed
	15	affect affected after against aka all allow allowed allows along already also
	16	although always am amount an ancient and and/or annoying another any anybody
	17	anyone anything anywhere apparent apparently are aren't around arrange arrive
	18	arrives as at
495ae4b0 PH	19
495ae4b0 PH	20	back bad based basically be because been behave behaviour being best between
8e26e4bf	21	bob both box bug build builds built busy but by
495ae4b0 PH	22
	23	call called calls can can't cannot causes causing central certain code comes
	24	coming command commands complain complaining complains configure configured
	25	conjunction contact contain contains contained correct correctly could
	26	currently customer
	27
	28	day days defined deliver delivers delivered delivery deliveries did do does
	29	doesn't doing don't down during
	30
8e26e4bf PH	31	e-mail e-mails each easy either else email emails entirely entries entry
8e26e4bf PH	32	especially etc even ever every example exim exim's experiencing
495ae4b0	33
8e26e4bf PH	34	far few file files find finds fine fix fixed fly following for form found from
8e26e4bf PH	35	fully
495ae4b0	36
8e26e4bf	37	generate generated get gets getting given gives giving go goes going got
495ae4b0 PH	38
	39	handle handles handled handling happen happens has have haven't having helpful
	40	him host hosts how however
	41
	42	i i'd i'm i've if in indeed instead into is issue issues isn't it it's its
	43
	44	jim just
	45
	46	keep keeps know knows
	47
	48	like line lines look looked looking lot
	49
8e26e4bf PH	50	m machine machines machine's mail mails main make me mean means message messages
8e26e4bf PH	51	might more much must my myself
495ae4b0 PH	52
	53	near need neither no nor not now
	54
	55	occur of off often ok on one only or other our out over own
	56
	57	part parts particular per place possibility possible present problem problems
	58	put puts
	59
	60	quite
	61
	62	raised rather really reason rid right round run runs
	63
8e26e4bf PH	64	same say saying see seeing seem seems seen sees set setting she should simply
	65	sit so some somehow something sometimes stand state statement still strange such
	66	supposed system systems
495ae4b0	67
8e26e4bf PH	68	take takes tell than that the their them then there these they things think this
8e26e4bf PH	69	those thought to try though to/for told too tried tries trying
495ae4b0 PH	70
	71	under until up use uses used using usually
	72
	73	valid value values via
	74
	75	want wanted wanting was way we we've well what what's when where whereabouts
	76	whenever whether which while who whose why will with within without wish won't
	77	wondered work worked working works would wrong
	78
	79	xxx
	80
	81	yet yyy
	82
	83	";
	84	########################################################################
	85
	86
	87	# The regular expression fragment that defines the separator between words
	88
	89	$wordgap = "(?:[]().?,;:\"']\|(?><[^>]>))(?:\\s+\|\$)(?:[[(\"'`]\|(?><[^>]>))";
	90
	91
	92	########################################################################
	93	# Function to add to a length to accommodate HTML stuff
	94
	95	sub setlen{
	96	my($len, $s) = @_;
	97
	98	$len += length($1) while ($s =~ /(<\/?[a-z]+>)/ig);
	99	$len += 1 while ($s =~ /&#\d+;/g);
	100
	101	return $len;
	102	}
	103
	104
	105	########################################################################
	106	# Function to write out the list of initials with references
	107
	108	sub write_initials {
	109	my($this_initial) = "$_[0]";
	110
	111	print OUT "<p>\n  ";
	112
	113	foreach $initial (sort keys %initials)
	114	{
	115	if ($initial eq $this_initial)
	116	{
	117	print OUT "   <font size=7 color=\"#FF0A0A\"><b>$initial</b></font> ";
	118	}
	119	else
	120	{
	121	print OUT "<a href=\"FAQ-KWIC_$initial.html\">  $initial</a>";
	122	}
	123	}
	124
	125	print OUT " "x4 . "<a href=\"FAQ.html#TOC\">FAQ Contents</a>\n</p>\n";
	126	}
	127
	128
	129
	130	########################################################################
	131	# The main program. We can pick out the contents lines because they lie
	132	# between <li> and </li> in the file, sometimes on more than one physical
	133	# line.
134
135	# Turn the list of ignorable words into a hash for quick lookup. Add the
136	# empty word to the list.
137
138	@words = split /\s+/, $ignore_list;
139	foreach $word (@words) { $ignore{$word} = 1; }
140	$ignore{""} = 1;
141
142
143	# Open the file and do the job
144
145	open(IN, "html/FAQ.html") \|\| die "Can't open html/FAQ.html\n";
146
147	while (<IN>)
148	{
149	next unless /^<li>/;
150	$_ .= <IN> while !/<\/li>$/;
151	chomp;
152	s/\n\s*/ /g;
153
154	# Extract the operative text into $text, with the beginning in $pre.
155
156	my($pre,$text,$post) = /^<li>(.<\/a>:(?: ))(.*)<br><br><\/li>$/;
157
158	# Now split into words. As well as punctuation, there may be HTML thingies
159	# between words. Absorb them into the separators.
160
161	my(@words) = split /$wordgap/, $text;
162
163	# Lower case all the words, and remove those that we don't want.
164	# Then keep a list of all the used initials.
165
166	REMOVE_IGNORE:
167	for ($i = 0; $i < scalar @words; $i++)
168	{
169	my($word) = $words[$i] = "\L$words[$i]\E";
170
171	# Remove certain forms of word and those on the ignore list
172
173	if (defined $ignore{$word} \|\| # word on ignore list
174	$word =~ /^-+$/ \|\| # word consists entirely of hyphens
175	$word =~ /^-[^a-z]/ \|\| # follows leading hyphen with non-letter
176	$word =~ /^[^a-z-]/ \|\| # starts with a non-letter or hyphen
177	$word =~ /[@^.]/ # contains @ or ^ or .
178	)
179	{
180	splice(@words, $i, 1);
181	redo REMOVE_IGNORE if $i < scalar @words;
182	}
183
184	# Otherwise, build up a list of initials
185
186	else
187	{
188	my($inword) = $word;
189	$inword =~ s/^-//;
190	$initial = substr($inword, 0, 1);
191	$initials{"\U$initial\E"} = 1;
192	}
193	}
194
195	# Create the lines for the KWIC index, and store them in associative
196	# arrays, with the keyword as the key. That will get them sorted
197	# automatically.
198
199	while (scalar @words > 0)
200	{
201	my($word) = shift @words;
202	my($pretext, $casedword, $posttext) =
203	$text =~ /(.?)(?<![a-z])(\Q$word\E)(?![a-z])(.)/i;
204
205	# Remove a leading hyphen from $word so that it sorts according to
206	# the leading letter. What is actually output is $casedword, which
207	# retains the hyphen.
208
209	$word =~ s/^-//;
210
211	my($prelen) = length $pretext;
212	my($postlen) = length $posttext;
213
214	# We want to chop excessively long entries on either side. We can't set
215	# a fixed length because of the HTML control data. Call a function to
216	# add the given length to allow for HTML stuff. This is crude, but it
217	# does roughtly the right thing.
218
219	my($leftlen) = &setlen(70, $pretext);
220	my($rightlen) = &setlen(70, $posttext);
221
222	if ($prelen > $leftlen)
223	{
224	my($cutoff) = $leftlen;
225	$cutoff++
226	while ($cutoff < $prelen && substr($pretext, -$cutoff, 1) ne " ");
227	$pretext = "... " . substr($pretext, -$cutoff);
228	}
229
230	if ($postlen > $rightlen)
231	{
232	my($cutoff) = $rightlen;
233	$cutoff++
234	while ($cutoff < $postlen && substr($posttext, $cutoff, 1) ne " ");
235	$posttext = substr($posttext, 0, $cutoff) . "...";
236	}
237
238	# If the pre text has a font-ending not preceded by a font beginning
239	# (i.e. we've chopped the beginning off), we must insert a beginning.
240
241	while ($pretext =~ /^(.*?)<\/(small\|tt\|b\|i)>/ && $1 !~ /<$2>/)
242	{
243	$pretext = "<$2>" . $pretext;
244	}
245
246	# If the pre text ends in a special font, we have to terminate that,
247	# and reset it at the start of the post text.
248
249	my($poststart) = "";
250
251	while ($pretext =~ /<(small\|tt\|b\|i)>(?!.*?<\/\1>)/)
252	{
253	$pretext .= "</$1>";
254	$poststart .= "<$1>";
255	}
256
257	# If the post text changes font but doesn't close it, we must add
258	# the closure.
259
260	while ($posttext =~ /<(small\|tt\|b\|i)>(?!.*?<\/\1>)/)
261	{
262	$posttext .= "</$1>";
263	}
264
265	# Remove any unnecessary changes in either of them
266
267	$pretext =~ s/<(small\|tt\|b\|i)>\s*<\/\1>//g;
268	$posttext =~ s/<(small\|tt\|b\|i)>\s*<\/\1>//g;
269
270	# Save the texts in associative arrays. Add the question number to
271	# the end of the word to make the key.
272
273	$pre =~ /(Q\d\d\d\d)/;
274	my($key) = "$word-$1";
275
276	$tableft{$key} = $pre . $pretext;
277	$tabright{$key} = $poststart .
278	"<font color=\"#FF0A0A\">$casedword</font>" . $posttext;
279	}
280	}
281
282	close(IN);
283
284	# Now write out the files. Each letter in the index goes in a different file
285
286	$current_initial = "";
287
288	foreach $key (sort keys %tableft)
289	{
290	my($initial) = $key =~ /^(.)/;
291	$initial = "\U$initial\E";
292
293	if ($initial ne $current_initial)
294	{
295	if ($current_initial ne "")
296	{
297	print OUT "</table>\n";
298	&write_initials($current_initial);
299	print OUT "</body>\n</html>\n";
300	close OUT;
301	}
302
303	open (OUT, ">html/FAQ-KWIC_$initial.html") \|\|
304	die "Can't open html/FAQ-KWIC_$initial.html\n";
305	print OUT
306	"<html>\n" .
307	"<head>\n" .
308	"<title>Exim FAQ: KWIC index section $initial</title>\n" .
309	"</head>\n" .
310	"<body bgcolor=\"#F8F8F8\" text=\"#00005A\" link=\"#0066FF\" alink=\"#0066FF\" vlink=\"#000099\">\n" .
311	"<h1>Exim FAQ: Keyword-in-context index</h1>\n";
312
313	write_initials($initial);
314
315	if ($initial eq "A")
316	{
317	print OUT <<End ;
318	<p>
319	This <i>Keyword-in-context</i> index for the Exim FAQ is generated
320	automatically from the FAQ source. Browsers may not display the data very
321	prettily, but it is hoped that it may provide a useful aid for finding things
322	in the FAQ.
323	</p>
324	End
325	}
326
327	print OUT "<table border>\n";
328	$current_initial = $initial;
329	}
330
331	print OUT "<tr>\n";
332	print OUT "<td align=\"right\">$tableft{$key}</td>\n";
333	print OUT "<td align=\"left\">$tabright{$key}</td>\n";
334	print OUT "</tr>\n";
335	}
336
337	# Close the final file
338
339	if ($current_initial ne "")
340	{
341	print OUT "</table>\n";
342	&write_initials($current_initial);
343	print OUT "</body>\n</html>\n";
344	close OUT;
345	}
346
347	# End