[exim.git] / doc / doc-scripts / fc2k

#! /usr/bin/perl -w

# Script to read the HTML table of contents for the Exim FAQ and create an
# HTML KWIC index out of it.


########################################################################
# List of words to ignore - kept alphabetically for reference, but they
# don't have to be in order.
#
# The string is split on white space by the main program and each word
# becomes a key in the %ignore hash. Words taken from the FAQ are converted
# to lower case before they are looked up, so every entry here must be in
# lower case. The blank lines between the groups are only for readability;
# the empty leading field that the split produces is harmless because the
# empty word is put on the ignore list as well.
#
# This is a double-quoted string: a word containing a double quote, a
# backslash, "$" or "@" would need escaping.

$ignore_list = "

a ability able about absence access according actual address addresses addressed
affect affected after against aka all allow allowed allows along already also
although always am amount an ancient and and/or annoying another any anybody
anyone anything anywhere apparent apparently are aren't around arrange arrive
arrives as at

back bad based basically be because been behave behaviour being best between
bob both box bug build builds built busy but by

call called calls can can't cannot causes causing central certain code comes
coming command commands complain complaining complains configure configured
conjunction contact contain contains contained correct correctly could
currently customer

day days defined deliver delivers delivered delivery deliveries did do does
doesn't doing don't down during

e-mail e-mails each easy either else email emails entirely entries entry
especially etc even ever every example exim exim's experiencing

far few file files find finds fine fix fixed fly following for form found from
fully

generate generated get gets getting given gives giving go goes going got

handle handles handled handling happen happens has have haven't having helpful
him host hosts how however

i i'd i'm i've if in indeed instead into is issue issues isn't it it's its

jim just

keep keeps know knows

like line lines look looked looking lot

m machine machines machine's mail mails main make me mean means message messages
might more much must my myself

near need neither no nor not now

occur of off often ok on one only or other our out over own

part parts particular per place possibility possible present problem problems
put puts

quite

raised rather really reason rid right round run runs

same say saying see seeing seem seems seen sees set setting she should simply
sit so some somehow something sometimes stand state statement still strange such
supposed system systems

take takes tell than that the their them then there these they things think this
those thought to try though to/for told too tried tries trying

under until up use uses used using usually

valid value values via

want wanted wanting was way we we've well what what's when where whereabouts
whenever whether which while who whose why will with within without wish won't
wondered work worked working works would wrong

xxx

yet yyy

";
########################################################################


# The regular expression fragment that defines the separator between words.
# It is kept as a string and interpolated into the pattern given to split in
# the main program. Reading the pattern from left to right, a separator is:
#
#   1. any number of items that trail the preceding word: one of the
#      characters ] ( ) . ? , ; : " ' or a complete <...> HTML tag;
#   2. a run of white space, or the end of the text;
#   3. any number of items that lead the following word: one of the
#      characters [ ( " ' ` or a complete <...> HTML tag.
#
# Part 2 is not optional, so punctuation or a tag in the middle of a word,
# with no white space beside it, does not split the word.

$wordgap = "(?:[]().?,;:\"']|(?><[^>]*>))*(?:\\s+|\$)(?:[[(\"'`]|(?><[^>]*>))*";


########################################################################
# Function to add to a length to accommodate HTML stuff
#
# Arguments:  $len  the number of visible characters that are wanted
#             $s    the text, which may contain HTML markup
#
# Returns:    $len plus an allowance for the markup in $s that takes up
#             room in the string without taking up room on the screen
#
# Only simple tags without attributes (such as <b> or </tt>) and numeric
# character references (such as &#62;) are allowed for; anything else is
# counted as ordinary text. An undefined $s is treated as an empty string.

sub setlen{
my($len, $s) = @_;

$s = "" unless defined $s;

# A tag displays as nothing at all, so allow for its whole length.

$len += length($1) while ($s =~ /(<\/?[a-z]+>)/ig);

# A numeric character reference displays as a single character, so allow
# for all of it except that one visible character.

$len += length($1) - 1 while ($s =~ /(&#\d+;)/g);

return $len;
}


########################################################################
# Function to write out the list of initials with references
#
# Argument:  the initial of the page being written
#
# One paragraph is written to OUT. It holds an entry for every key of
# %initials, in sorted order: the initial of the current page is shown large
# and in colour, and every other initial is a link to that initial's page.
# A link back to the FAQ contents finishes the paragraph.

sub write_initials {
my($current) = "$_[0]";
my($bar) = "";

foreach my $letter (sort keys %initials)
  {
  $bar .= ($letter eq $current)?
    "&nbsp;&nbsp;&nbsp;<font size=7 color=\"#FF0A0A\"><b>$letter</b></font>&nbsp;" :
    "<a href=\"FAQ-KWIC_$letter.html\">&nbsp;&nbsp;$letter</a>";
  }

print OUT "<p>\n&nbsp;&nbsp;" . $bar . "&nbsp;" x 4 .
  "<a href=\"FAQ.html#TOC\">FAQ Contents</a>\n</p>\n";
}


########################################################################
# The main program. We can pick out the contents lines because they lie
# between <li> and </li> in the file, sometimes on more than one physical
# line.

# Build the %ignore lookup table from the list of ignorable words: every
# word in the list becomes a key with the value 1, and so does the empty
# word.

@words = split /\s+/, $ignore_list;
@ignore{@words, ""} = (1) x (@words + 1);


# Open the file and do the job. Each contents entry is gathered into a
# single string and split into words. Every word that survives the filtering
# gets one KWIC line, whose left and right halves are stored in %tableft and
# %tabright under the key "word-Qnnnn". The initials of the surviving words
# are collected in %initials.

open(IN, "<", "html/FAQ.html") || die "Can't open html/FAQ.html: $!\n";

while (<IN>)
  {
  next unless /^<li>/;

  # An entry may be spread over more than one physical line, so join the
  # lines up. Stop at the end of the file so that an unterminated entry
  # cannot keep this loop going for ever.

  while (!/<\/li>$/)
    {
    my $more = <IN>;
    last unless defined $more;
    $_ .= $more;
    }

  chomp;
  s/\n\s*/ /g;

  # Extract the operative text into $text, with the beginning in $pre.
  # An entry that does not have the expected shape has no text to index.

  my($pre,$text) = /^<li>(.*<\/a>:(?:&nbsp;)*)(.*)<br><br><\/li>$/;
  next unless defined $text;

  # Pick out the question number, which is used to make the keys unique.
  # Capture it directly rather than via $1, which would still hold the
  # result of some earlier match if this one failed. If there is no
  # question number, fall back on the input line number.

  my($qnumber) = $pre =~ /(Q\d\d\d\d)/;
  $qnumber = sprintf("line%d", $.) unless defined $qnumber;

  # Now split into words. As well as punctuation, there may be HTML thingies
  # between words; they are absorbed into the separators. Lower case all the
  # words, and keep only those that we want. At the same time, keep a list
  # of all the used initials.

  my(@words);

  foreach my $rawword (split /$wordgap/, $text)
    {
    my($word) = lc $rawword;

    # Drop certain forms of word and those on the ignore list

    next if (defined $ignore{$word} ||  # word on ignore list
             $word =~ /^-+$/        ||  # word consists entirely of hyphens
             $word =~ /^-[^a-z]/    ||  # follows leading hyphen with non-letter
             $word =~ /^[^a-z-]/    ||  # starts with a non-letter or hyphen
             $word =~ /[@^.]/           # contains @ or ^ or .
            );

    push @words, $word;

    # The initial is the first letter after any leading hyphen

    my($inword) = $word;
    $inword =~ s/^-//;
    $initials{uc substr($inword, 0, 1)} = 1;
    }

  # Create the lines for the KWIC index, and store them in associative
  # arrays, with the keyword as the key. That will get them sorted
  # automatically.

  foreach my $keyword (@words)
    {
    # Find the keyword in the text, as a whole word, without regard to case.
    # $casedword is the keyword as it is actually written in the text.

    my($pretext, $casedword, $posttext) =
      $text =~ /(.*?)(?<![a-z])(\Q$keyword\E)(?![a-z])(.*)/i;
    next unless defined $casedword;

    # Remove a leading hyphen from $word so that it sorts according to
    # the leading letter. What is actually output is $casedword, which
    # retains the hyphen.

    my($word) = $keyword;
    $word =~ s/^-//;

    my($prelen) = length $pretext;
    my($postlen) = length $posttext;

    # We want to chop excessively long entries on either side. We can't set
    # a fixed length because of the HTML control data. Call a function to
    # add the given length to allow for HTML stuff. This is crude, but it
    # does roughly the right thing.

    my($leftlen) = setlen(70, $pretext);
    my($rightlen) = setlen(70, $posttext);

    # Chop the left-hand text at a space, keeping the end nearest the keyword

    if ($prelen > $leftlen)
      {
      my($cutoff) = $leftlen;
      $cutoff++
        while ($cutoff < $prelen && substr($pretext, -$cutoff, 1) ne " ");
      $pretext = "... " . substr($pretext, -$cutoff);
      }

    # Chop the right-hand text at a space, keeping the start

    if ($postlen > $rightlen)
      {
      my($cutoff) = $rightlen;
      $cutoff++
        while ($cutoff < $postlen && substr($posttext, $cutoff, 1) ne " ");
      $posttext = substr($posttext, 0, $cutoff) . "...";
      }

    # If the pre text has a font-ending not preceded by a font beginning
    # (i.e. we've chopped the beginning off), we must insert a beginning.

    while ($pretext =~ /^(.*?)<\/(small|tt|b|i)>/ && $1 !~ /<$2>/)
      {
      $pretext = "<$2>" . $pretext;
      }

    # If the pre text ends in a special font, we have to terminate that,
    # and reset it at the start of the post text.

    my($poststart) = "";

    while ($pretext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
      {
      $pretext .= "</$1>";
      $poststart .= "<$1>";
      }

    # If the post text changes font but doesn't close it, we must add
    # the closure.

    while ($posttext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
      {
      $posttext .= "</$1>";
      }

    # Remove any unnecessary changes in either of them

    $pretext  =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
    $posttext =~ s/<(small|tt|b|i)>\s*<\/\1>//g;

    # Save the texts in associative arrays. Add the question number to
    # the end of the word to make the key.

    my($key) = "$word-$qnumber";

    $tableft{$key}  = $pre . $pretext;
    $tabright{$key} = $poststart .
      "<font color=\"#FF0A0A\">$casedword</font>" . $posttext;
    }
  }

close(IN);

# Now write out the files. Each letter in the index goes in a different file,
# html/FAQ-KWIC_<initial>.html. The keys are processed in sorted order, and a
# new page is started whenever the initial of the key differs from that of
# the page currently being written.

# Function to finish off the page that is currently open on OUT: close the
# table, repeat the list of initials, end the document, and close the file.
# The close is checked because that is when an error in writing buffered
# output is reported.
#
# Argument:  the initial of the page being finished

sub finish_kwic_page {
my($page_initial) = @_;

print OUT "</table>\n";
write_initials($page_initial);
print OUT "</body>\n</html>\n";
close(OUT) || die "Can't write html/FAQ-KWIC_$page_initial.html: $!\n";
}

$current_initial = "";

foreach my $key (sort keys %tableft)
  {
  my($initial) = $key =~ /^(.)/;
  $initial = "\U$initial\E";

  if ($initial ne $current_initial)
    {
    # Finish the previous page, if there is one, before starting the next

    finish_kwic_page($current_initial) if $current_initial ne "";

    open (OUT, ">", "html/FAQ-KWIC_$initial.html") ||
      die "Can't open html/FAQ-KWIC_$initial.html: $!\n";
    print OUT
      "<html>\n" .
      "<head>\n" .
      "<title>Exim FAQ: KWIC index section $initial</title>\n" .
      "</head>\n" .
      "<body bgcolor=\"#F8F8F8\" text=\"#00005A\" link=\"#0066FF\" alink=\"#0066FF\" vlink=\"#000099\">\n" .
      "<h1>Exim FAQ: Keyword-in-context index</h1>\n";

    write_initials($initial);

    # Only the page for the letter A carries the explanatory paragraph

    if ($initial eq "A")
      {
      print OUT <<End ;
<p>
This <i>Keyword-in-context</i> index for the Exim FAQ is generated
automatically from the FAQ source. Browsers may not display the data very
prettily, but it is hoped that it may provide a useful aid for finding things
in the FAQ.
</p>
End
      }

    print OUT "<table border>\n";
    $current_initial = $initial;
    }

  # One table row for each entry: the text before the keyword is
  # right-aligned, and the keyword with what follows it is left-aligned.

  print OUT "<tr>\n";
  print OUT "<td align=\"right\">$tableft{$key}</td>\n";
  print OUT "<td align=\"left\">$tabright{$key}</td>\n";
  print OUT "</tr>\n";
  }

# Close the final file

finish_kwic_page($current_initial) if $current_initial ne "";

# End
Commit	Line	Data
	1	#! /usr/bin/perl -w
	2
	3	# Script to read the HTML table of contents for the Exim FAQ and create an
	4	# HTML KWIC index out of it.
	5
	6
	7	########################################################################
	8	# List of words to ignore - kept alphabetically for reference, but they
	9	# don't have to be in order.
	10
	11	$ignore_list = "
	12
	13	a ability able about absence access according actual address addresses addressed
	14	affect affected after against aka all allow allowed allows along already also
	15	although always am amount an ancient and and/or annoying another any anybody
	16	anyone anything anywhere apparent apparently are aren't around arrange arrive
	17	arrives as at
	18
	19	back bad based basically be because been behave behaviour being best between
	20	bob both box bug build builds built busy but by
	21
	22	call called calls can can't cannot causes causing central certain code comes
	23	coming command commands complain complaining complains configure configured
	24	conjunction contact contain contains contained correct correctly could
	25	currently customer
	26
	27	day days defined deliver delivers delivered delivery deliveries did do does
	28	doesn't doing don't down during
	29
	30	e-mail e-mails each easy either else email emails entirely entries entry
	31	especially etc even ever every example exim exim's experiencing
	32
	33	far few file files find finds fine fix fixed fly following for form found from
	34	fully
	35
	36	generate generated get gets getting given gives giving go goes going got
	37
	38	handle handles handled handling happen happens has have haven't having helpful
	39	him host hosts how however
	40
	41	i i'd i'm i've if in indeed instead into is issue issues isn't it it's its
	42
	43	jim just
	44
	45	keep keeps know knows
	46
	47	like line lines look looked looking lot
	48
	49	m machine machines machine's mail mails main make me mean means message messages
	50	might more much must my myself
	51
	52	near need neither no nor not now
	53
	54	occur of off often ok on one only or other our out over own
	55
	56	part parts particular per place possibility possible present problem problems
	57	put puts
	58
	59	quite
	60
	61	raised rather really reason rid right round run runs
	62
	63	same say saying see seeing seem seems seen sees set setting she should simply
	64	sit so some somehow something sometimes stand state statement still strange such
	65	supposed system systems
	66
	67	take takes tell than that the their them then there these they things think this
	68	those thought to try though to/for told too tried tries trying
	69
	70	under until up use uses used using usually
	71
	72	valid value values via
	73
	74	want wanted wanting was way we we've well what what's when where whereabouts
	75	whenever whether which while who whose why will with within without wish won't
	76	wondered work worked working works would wrong
	77
	78	xxx
	79
	80	yet yyy
	81
	82	";
	83	########################################################################
	84
	85
	86	# The regular expression fragment that defines the separator between words
	87
	88	$wordgap = "(?:[]().?,;:\"']|(?><[^>]*>))*(?:\\s+|\$)(?:[[(\"'`]|(?><[^>]*>))*";
	89
	90
	91	########################################################################
	92	# Function to add to a length to accommodate HTML stuff
	93
	94	sub setlen{
	95	my($len, $s) = @_;
	96
	97	$len += length($1) while ($s =~ /(<\/?[a-z]+>)/ig);
	98	$len += 1 while ($s =~ /&#\d+;/g);
	99
	100	return $len;
	101	}
	102
	103
	104	########################################################################
	105	# Function to write out the list of initials with references
	106
	107	sub write_initials {
	108	my($this_initial) = "$_[0]";
	109
	110	print OUT "<p>\n  ";
	111
	112	foreach $initial (sort keys %initials)
	113	{
	114	if ($initial eq $this_initial)
	115	{
	116	print OUT "   <font size=7 color=\"#FF0A0A\"><b>$initial</b></font> ";
	117	}
	118	else
	119	{
	120	print OUT "<a href=\"FAQ-KWIC_$initial.html\">  $initial</a>";
	121	}
	122	}
	123
	124	print OUT " "x4 . "<a href=\"FAQ.html#TOC\">FAQ Contents</a>\n</p>\n";
	125	}
	126
	127
	128
	129	########################################################################
	130	# The main program. We can pick out the contents lines because they lie
	131	# between <li> and </li> in the file, sometimes on more than one physical
	132	# line.
	133
	134	# Turn the list of ignorable words into a hash for quick lookup. Add the
	135	# empty word to the list.
	136
	137	@words = split /\s+/, $ignore_list;
	138	foreach $word (@words) { $ignore{$word} = 1; }
	139	$ignore{""} = 1;
	140
	141
	142	# Open the file and do the job
	143
	144	open(IN, "html/FAQ.html") \|\| die "Can't open html/FAQ.html\n";
	145
	146	while (<IN>)
	147	{
	148	next unless /^<li>/;
	149	$_ .= <IN> while !/<\/li>$/;
	150	chomp;
	151	s/\n\s*/ /g;
	152
	153	# Extract the operative text into $text, with the beginning in $pre.
	154
	155	my($pre,$text,$post) = /^<li>(.*<\/a>:(?:&nbsp;)*)(.*)<br><br><\/li>$/;
	156
	157	# Now split into words. As well as punctuation, there may be HTML thingies
	158	# between words. Absorb them into the separators.
	159
	160	my(@words) = split /$wordgap/, $text;
	161
	162	# Lower case all the words, and remove those that we don't want.
	163	# Then keep a list of all the used initials.
	164
	165	REMOVE_IGNORE:
	166	for ($i = 0; $i < scalar @words; $i++)
	167	{
	168	my($word) = $words[$i] = "\L$words[$i]\E";
	169
	170	# Remove certain forms of word and those on the ignore list
	171
	172	if (defined $ignore{$word} \|\| # word on ignore list
	173	$word =~ /^-+$/ \|\| # word consists entirely of hyphens
	174	$word =~ /^-[^a-z]/ \|\| # follows leading hyphen with non-letter
	175	$word =~ /^[^a-z-]/ \|\| # starts with a non-letter or hyphen
	176	$word =~ /[@^.]/ # contains @ or ^ or .
	177	)
	178	{
	179	splice(@words, $i, 1);
	180	redo REMOVE_IGNORE if $i < scalar @words;
	181	}
	182
	183	# Otherwise, build up a list of initials
	184
	185	else
	186	{
	187	my($inword) = $word;
	188	$inword =~ s/^-//;
	189	$initial = substr($inword, 0, 1);
	190	$initials{"\U$initial\E"} = 1;
	191	}
	192	}
	193
	194	# Create the lines for the KWIC index, and store them in associative
	195	# arrays, with the keyword as the key. That will get them sorted
	196	# automatically.
	197
	198	while (scalar @words > 0)
	199	{
	200	my($word) = shift @words;
	201	my($pretext, $casedword, $posttext) =
	202	$text =~ /(.*?)(?<![a-z])(\Q$word\E)(?![a-z])(.*)/i;
	203
	204	# Remove a leading hyphen from $word so that it sorts according to
	205	# the leading letter. What is actually output is $casedword, which
	206	# retains the hyphen.
	207
	208	$word =~ s/^-//;
	209
	210	my($prelen) = length $pretext;
	211	my($postlen) = length $posttext;
	212
	213	# We want to chop excessively long entries on either side. We can't set
	214	# a fixed length because of the HTML control data. Call a function to
	215	# add the given length to allow for HTML stuff. This is crude, but it
	216	# does roughtly the right thing.
	217
	218	my($leftlen) = &setlen(70, $pretext);
	219	my($rightlen) = &setlen(70, $posttext);
	220
	221	if ($prelen > $leftlen)
	222	{
	223	my($cutoff) = $leftlen;
	224	$cutoff++
	225	while ($cutoff < $prelen && substr($pretext, -$cutoff, 1) ne " ");
	226	$pretext = "... " . substr($pretext, -$cutoff);
	227	}
	228
	229	if ($postlen > $rightlen)
	230	{
	231	my($cutoff) = $rightlen;
	232	$cutoff++
	233	while ($cutoff < $postlen && substr($posttext, $cutoff, 1) ne " ");
	234	$posttext = substr($posttext, 0, $cutoff) . "...";
	235	}
	236
	237	# If the pre text has a font-ending not preceded by a font beginning
	238	# (i.e. we've chopped the beginning off), we must insert a beginning.
	239
	240	while ($pretext =~ /^(.*?)<\/(small\|tt\|b\|i)>/ && $1 !~ /<$2>/)
	241	{
	242	$pretext = "<$2>" . $pretext;
	243	}
	244
	245	# If the pre text ends in a special font, we have to terminate that,
	246	# and reset it at the start of the post text.
	247
	248	my($poststart) = "";
	249
	250	while ($pretext =~ /<(small\|tt\|b\|i)>(?!.*?<\/\1>)/)
	251	{
	252	$pretext .= "</$1>";
	253	$poststart .= "<$1>";
	254	}
	255
	256	# If the post text changes font but doesn't close it, we must add
	257	# the closure.
	258
	259	while ($posttext =~ /<(small\|tt\|b\|i)>(?!.*?<\/\1>)/)
	260	{
	261	$posttext .= "</$1>";
	262	}
	263
	264	# Remove any unnecessary changes in either of them
	265
	266	$pretext =~ s/<(small\|tt\|b\|i)>\s*<\/\1>//g;
	267	$posttext =~ s/<(small\|tt\|b\|i)>\s*<\/\1>//g;
	268
	269	# Save the texts in associative arrays. Add the question number to
	270	# the end of the word to make the key.
	271
	272	$pre =~ /(Q\d\d\d\d)/;
	273	my($key) = "$word-$1";
	274
	275	$tableft{$key} = $pre . $pretext;
	276	$tabright{$key} = $poststart .
	277	"<font color=\"#FF0A0A\">$casedword</font>" . $posttext;
	278	}
	279	}
	280
	281	close(IN);
	282
	283	# Now write out the files. Each letter in the index goes in a different file
	284
	285	$current_initial = "";
	286
	287	foreach $key (sort keys %tableft)
	288	{
	289	my($initial) = $key =~ /^(.)/;
	290	$initial = "\U$initial\E";
	291
	292	if ($initial ne $current_initial)
	293	{
	294	if ($current_initial ne "")
	295	{
	296	print OUT "</table>\n";
	297	&write_initials($current_initial);
	298	print OUT "</body>\n</html>\n";
	299	close OUT;
	300	}
	301
	302	open (OUT, ">html/FAQ-KWIC_$initial.html") \|\|
	303	die "Can't open html/FAQ-KWIC_$initial.html\n";
	304	print OUT
	305	"<html>\n" .
	306	"<head>\n" .
	307	"<title>Exim FAQ: KWIC index section $initial</title>\n" .
	308	"</head>\n" .
	309	"<body bgcolor=\"#F8F8F8\" text=\"#00005A\" link=\"#0066FF\" alink=\"#0066FF\" vlink=\"#000099\">\n" .
	310	"<h1>Exim FAQ: Keyword-in-context index</h1>\n";
	311
	312	write_initials($initial);
	313
	314	if ($initial eq "A")
	315	{
	316	print OUT <<End ;
	317	<p>
	318	This <i>Keyword-in-context</i> index for the Exim FAQ is generated
	319	automatically from the FAQ source. Browsers may not display the data very
	320	prettily, but it is hoped that it may provide a useful aid for finding things
	321	in the FAQ.
	322	</p>
	323	End
	324	}
	325
	326	print OUT "<table border>\n";
	327	$current_initial = $initial;
	328	}
	329
	330	print OUT "<tr>\n";
	331	print OUT "<td align=\"right\">$tableft{$key}</td>\n";
	332	print OUT "<td align=\"left\">$tabright{$key}</td>\n";
	333	print OUT "</tr>\n";
	334	}
	335
	336	# Close the final file
	337
	338	if ($current_initial ne "")
	339	{
	340	print OUT "</table>\n";
	341	&write_initials($current_initial);
	342	print OUT "</body>\n</html>\n";
	343	close OUT;
	344	}
	345
	346	# End