[exim.git] / doc / doc-scripts / fc2k

#! /usr/bin/perl -w
# $Cambridge: exim/doc/doc-scripts/fc2k,v 1.1 2004/10/07 15:04:35 ph10 Exp $

# Script to read the HTML table of contents for the Exim FAQ and create an
# HTML KWIC index out of it.


########################################################################
# List of words to ignore - kept alphabetically for reference, but they
# don't have to be in order.

$ignore_list = "

a ability able about address addresses addressed affect affected
after against all allow allowed allows already also although always am an and
and/or any anybody anyone anything anywhere are aren't arrange arrive as at

back bad based basically be because been behave behaviour being best between
bob both bug build builds built busy but by

call called calls can can't cannot causes causing central certain code comes
coming command commands complain complaining complains configure configured
conjunction contact contain contains contained correct correctly could
currently customer

day days defined deliver delivers delivered delivery deliveries did do does
doesn't doing don't down during

e-mail e-mails each easy else email emails entirely entries entry especially
etc even ever every example exim exim's experiencing

far few file files find fine fly following for form found from fully

get gets getting given gives giving go goes going got

handle handles handled handling happen happens has have haven't having helpful
him host hosts how however

i i'd i'm i've if in indeed instead into is issue issues isn't it it's its

jim just

keep keeps know knows

like line lines look looked looking lot

machine machines machine's mail mails main make me mean means message messages
might more must my myself

near need neither no nor not now

occur of off often ok on one only or other our out over own

part parts particular per place possibility possible present problem problems
put puts

quite

raised rather really reason rid right round run runs

same say saying see seeing seem seems seen sees set setting she should so some
somehow something sometimes stand state statement still strange such supposed
system systems

take takes than that the their them then there these they things think this
those to try though to/for told too tried tries trying

under until up use uses used using usually

valid value values via

want wanted wanting was way we we've well what what's when where whereabouts
whenever whether which while who whose why will with within without wish won't
wondered work worked working works would wrong

xxx

yet yyy

";
########################################################################


# The regular expression fragment that defines the separator between words

$wordgap = "(?:[]().?,;:\"']|(?><[^>]*>))*(?:\\s+|\$)(?:[[(\"'`]|(?><[^>]*>))*";


########################################################################
# Function to add to a length to accommodate HTML stuff

sub setlen{
my($len, $s) = @_;

$len += length($1) while ($s =~ /(<\/?[a-z]+>)/ig);
$len += 1 while ($s =~ /&#\d+;/g);

return $len;
}


########################################################################
# Function to write out the list of initials with references

sub write_initials {
my($this_initial) = "$_[0]";

print OUT "<p>\n&nbsp;&nbsp;";

foreach $initial (sort keys %initials)
  {
  if ($initial eq $this_initial)
    {
    print OUT "&nbsp;&nbsp;&nbsp;<font size=7 color=\"#FF0A0A\"><b>$initial</b></font>&nbsp;";
    }
  else
    {
    print OUT "<a href=\"FAQ-KWIC_$initial.html\">&nbsp;&nbsp;$initial</a>";
    }
  }

print OUT "&nbsp;"x4 . "<a href=\"FAQ.html#TOC\">FAQ Contents</a>\n</p>\n";
}


########################################################################
# The main program. We can pick out the contents lines because they lie
# between <li> and </li> in the file, sometimes on more than one physical
# line.

# Turn the list of ignorable words into a hash for quick lookup. Add the
# empty word to the list.

@words = split /\s+/, $ignore_list;
foreach $word (@words) { $ignore{$word} = 1; }
$ignore{""} = 1;


# Open the file and do the job

open(IN, "html/FAQ.html") || die "Can't open html/FAQ.html\n";

while (<IN>)
  {
  next unless /^<li>/;
  $_ .= <IN> while !/<\/li>$/;
  chomp;
  s/\n\s*/ /g;

  # Extract the operative text into $text, with the beginning in $pre.

  my($pre,$text,$post) = /^<li>(.*<\/a>:(?:&nbsp;)*)(.*)<br><br><\/li>$/;

  # Now split into words. As well as punctuation, there may be HTML thingies
  # between words. Absorb them into the separators.

  my(@words) = split /$wordgap/, $text;

  # Lower case all the words, and remove those that we don't want.
  # Then keep a list of all the used initials.

  REMOVE_IGNORE:
  for ($i = 0; $i < scalar @words; $i++)
    {
    my($word) = $words[$i] = "\L$words[$i]\E";

    # Remove certain forms of word and those on the ignore list

    if (defined $ignore{$word} ||  # word on ignore list
        $word =~ /^-+$/        ||  # word consists entirely of hyphens
        $word =~ /^-[^a-z]/    ||  # follows leading hyphen with non-letter
        $word =~ /^[^a-z-]/    ||  # starts with a non-letter or hyphen
        $word =~ /[@^.]/           # contains @ or ^ or .
       )
      {
      splice(@words, $i, 1);
      redo REMOVE_IGNORE if $i < scalar @words;
      }
      
    # Otherwise, build up a list of initials
     
    else
      {
      my($inword) = $word; 
      $inword =~ s/^-//; 
      $initial = substr($inword, 0, 1);
      $initials{"\U$initial\E"} = 1;
      }
    }

  # Create the lines for the KWIC index, and store them in associative
  # arrays, with the keyword as the key. That will get them sorted
  # automatically.

  while (scalar @words > 0)
    {
    my($word) = shift @words;
    my($pretext, $casedword, $posttext) =
      $text =~ /(.*?)(?<![a-z])(\Q$word\E)(?![a-z])(.*)/i;
      
    # Remove a leading hyphen from $word so that it sorts according to
    # the leading letter. What is actually output is $casedword, which
    # retains the hyphen.
    
    $word =~ s/^-//;   

    my($prelen) = length $pretext;
    my($postlen) = length $posttext;

    # We want to chop excessively long entries on either side. We can't set
    # a fixed length because of the HTML control data. Call a function to
    # add the given length to allow for HTML stuff. This is crude, but it
    # does roughtly the right thing.

    my($leftlen) = &setlen(70, $pretext);
    my($rightlen) = &setlen(70, $posttext);

    if ($prelen > $leftlen)
      {
      my($cutoff) = $leftlen;
      $cutoff++
        while ($cutoff < $prelen && substr($pretext, -$cutoff, 1) ne " ");
      $pretext = "... " . substr($pretext, -$cutoff);
      }

    if ($postlen > $rightlen)
      {
      my($cutoff) = $rightlen;
      $cutoff++
        while ($cutoff < $postlen && substr($posttext, $cutoff, 1) ne " ");
      $posttext = substr($posttext, 0, $cutoff) . "...";
      }

    # If the pre text has a font-ending not preceded by a font beginning
    # (i.e. we've chopped the beginning off), we must insert a beginning.

    while ($pretext =~ /^(.*?)<\/(small|tt|b|i)>/ && $1 !~ /<$2>/)
      {
      $pretext = "<$2>" . $pretext;
      }

    # If the pre text ends in a special font, we have to terminate that,
    # and reset it at the start of the post text.

    my($poststart) = "";

    while ($pretext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
      {
      $pretext .= "</$1>";
      $poststart .= "<$1>";
      }

    # If the post text changes font but doesn't close it, we must add
    # the closure.

    while ($posttext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
      {
      $posttext .= "</$1>";
      }

    # Remove any unnecessary changes in either of them

    $pretext  =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
    $posttext =~ s/<(small|tt|b|i)>\s*<\/\1>//g;

    # Save the texts in associative arrays. Add the question number to
    # the end of the word to make the key.

    $pre =~ /(Q\d\d\d\d)/;
    my($key) = "$word-$1";

    $tableft{$key}  = $pre . $pretext;
    $tabright{$key} = $poststart .
      "<font color=\"#FF0A0A\">$casedword</font>" . $posttext;
    }
  }

close(IN);

# Now write out the files. Each letter in the index goes in a different file

$current_initial = "";

foreach $key (sort keys %tableft)
  {
  my($initial) = $key =~ /^(.)/;
  $initial = "\U$initial\E";

  if ($initial ne $current_initial)
    {
    if ($current_initial ne "")
      {
      print OUT "</table>\n";
      &write_initials($current_initial);
      print OUT "</body>\n</html>\n";
      close OUT;
      }

    open (OUT, ">html/FAQ-KWIC_$initial.html") ||
      die "Can't open html/FAQ-KWIC_$initial.html\n";
    print OUT
      "<html>\n" .
      "<head>\n" .
      "<title>Exim FAQ: KWIC index section $initial</title>\n" .
      "</head>\n" .
      "<body bgcolor=\"#F8F8F8\" text=\"#00005A\" link=\"#0066FF\" alink=\"#0066FF\" vlink=\"#000099\">\n" .
      "<h1>Exim FAQ: Keyword-in-context index</h1>\n";

    write_initials($initial);

    if ($initial eq "A")
      {
      print OUT <<End ;
<p>
This <i>Keyword-in-context</i> index for the Exim FAQ is generated
automatically from the FAQ source. Browsers may not display the data very
prettily, but it is hoped that it may provide a useful aid for finding things
in the FAQ.
</p>
End
      }

    print OUT "<table border>\n";
    $current_initial = $initial;
    }

  print OUT "<tr>\n";
  print OUT "<td align=\"right\">$tableft{$key}</td>\n";
  print OUT "<td align=\"left\">$tabright{$key}</td>\n";
  print OUT "</tr>\n";
  }

# Close the final file

if ($current_initial ne "")
  {
  print OUT "</table>\n";
  &write_initials($current_initial);
  print OUT "</body>\n</html>\n";
  close OUT;
  }

# End
Commit	Line	Data
495ae4b0 PH	1	#! /usr/bin/perl -w
	2	# $Cambridge: exim/doc/doc-scripts/fc2k,v 1.1 2004/10/07 15:04:35 ph10 Exp $
	3
	4	# Script to read the HTML table of contents for the Exim FAQ and create an
	5	# HTML KWIC index out of it.
	6
	7
	8	########################################################################
	9	# List of words to ignore - kept alphabetically for reference, but they
	10	# don't have to be in order.
	11
	12	$ignore_list = "
	13
	14	a ability able about address addresses addressed affect affected
	15	after against all allow allowed allows already also although always am an and
	16	and/or any anybody anyone anything anywhere are aren't arrange arrive as at
	17
	18	back bad based basically be because been behave behaviour being best between
	19	bob both bug build builds built busy but by
	20
	21	call called calls can can't cannot causes causing central certain code comes
	22	coming command commands complain complaining complains configure configured
	23	conjunction contact contain contains contained correct correctly could
	24	currently customer
	25
	26	day days defined deliver delivers delivered delivery deliveries did do does
	27	doesn't doing don't down during
	28
	29	e-mail e-mails each easy else email emails entirely entries entry especially
	30	etc even ever every example exim exim's experiencing
	31
	32	far few file files find fine fly following for form found from fully
	33
	34	get gets getting given gives giving go goes going got
	35
	36	handle handles handled handling happen happens has have haven't having helpful
	37	him host hosts how however
	38
	39	i i'd i'm i've if in indeed instead into is issue issues isn't it it's its
	40
	41	jim just
	42
	43	keep keeps know knows
	44
	45	like line lines look looked looking lot
	46
	47	machine machines machine's mail mails main make me mean means message messages
	48	might more must my myself
	49
	50	near need neither no nor not now
	51
	52	occur of off often ok on one only or other our out over own
	53
	54	part parts particular per place possibility possible present problem problems
	55	put puts
	56
	57	quite
	58
	59	raised rather really reason rid right round run runs
	60
	61	same say saying see seeing seem seems seen sees set setting she should so some
	62	somehow something sometimes stand state statement still strange such supposed
	63	system systems
	64
65	take takes than that the their them then there these they things think this
66	those to try though to/for told too tried tries trying
67
68	under until up use uses used using usually
69
70	valid value values via
71
72	want wanted wanting was way we we've well what what's when where whereabouts
73	whenever whether which while who whose why will with within without wish won't
74	wondered work worked working works would wrong
75
76	xxx
77
78	yet yyy
79
80	";
81	########################################################################
82
83
84	# The regular expression fragment that defines the separator between words
85
86	$wordgap = "(?:[]().?,;:\"']\|(?><[^>]>))(?:\\s+\|\$)(?:[[(\"'`]\|(?><[^>]>))";
87
88
89	########################################################################
90	# Function to add to a length to accommodate HTML stuff
91
92	sub setlen{
93	my($len, $s) = @_;
94
95	$len += length($1) while ($s =~ /(<\/?[a-z]+>)/ig);
96	$len += 1 while ($s =~ /&#\d+;/g);
97
98	return $len;
99	}
100
101
102	########################################################################
103	# Function to write out the list of initials with references
104
105	sub write_initials {
106	my($this_initial) = "$_[0]";
107
108	print OUT "<p>\n  ";
109
110	foreach $initial (sort keys %initials)
111	{
112	if ($initial eq $this_initial)
113	{
114	print OUT "   <font size=7 color=\"#FF0A0A\"><b>$initial</b></font> ";
115	}
116	else
117	{
118	print OUT "<a href=\"FAQ-KWIC_$initial.html\">  $initial</a>";
119	}
120	}
121
122	print OUT " "x4 . "<a href=\"FAQ.html#TOC\">FAQ Contents</a>\n</p>\n";
123	}
124
125
126
127	########################################################################
128	# The main program. We can pick out the contents lines because they lie
129	# between <li> and </li> in the file, sometimes on more than one physical
130	# line.
131
132	# Turn the list of ignorable words into a hash for quick lookup. Add the
133	# empty word to the list.
134
135	@words = split /\s+/, $ignore_list;
136	foreach $word (@words) { $ignore{$word} = 1; }
137	$ignore{""} = 1;
138
139
140	# Open the file and do the job
141
142	open(IN, "html/FAQ.html") \|\| die "Can't open html/FAQ.html\n";
143
144	while (<IN>)
145	{
146	next unless /^<li>/;
147	$_ .= <IN> while !/<\/li>$/;
148	chomp;
149	s/\n\s*/ /g;
150
151	# Extract the operative text into $text, with the beginning in $pre.
152
153	my($pre,$text,$post) = /^<li>(.<\/a>:(?: ))(.*)<br><br><\/li>$/;
154
155	# Now split into words. As well as punctuation, there may be HTML thingies
156	# between words. Absorb them into the separators.
157
158	my(@words) = split /$wordgap/, $text;
159
160	# Lower case all the words, and remove those that we don't want.
161	# Then keep a list of all the used initials.
162
163	REMOVE_IGNORE:
164	for ($i = 0; $i < scalar @words; $i++)
165	{
166	my($word) = $words[$i] = "\L$words[$i]\E";
167
168	# Remove certain forms of word and those on the ignore list
169
170	if (defined $ignore{$word} \|\| # word on ignore list
171	$word =~ /^-+$/ \|\| # word consists entirely of hyphens
172	$word =~ /^-[^a-z]/ \|\| # follows leading hyphen with non-letter
173	$word =~ /^[^a-z-]/ \|\| # starts with a non-letter or hyphen
174	$word =~ /[@^.]/ # contains @ or ^ or .
175	)
176	{
177	splice(@words, $i, 1);
178	redo REMOVE_IGNORE if $i < scalar @words;
179	}
180
181	# Otherwise, build up a list of initials
182
183	else
184	{
185	my($inword) = $word;
186	$inword =~ s/^-//;
187	$initial = substr($inword, 0, 1);
188	$initials{"\U$initial\E"} = 1;
189	}
190	}
191
192	# Create the lines for the KWIC index, and store them in associative
193	# arrays, with the keyword as the key. That will get them sorted
194	# automatically.
195
196	while (scalar @words > 0)
197	{
198	my($word) = shift @words;
199	my($pretext, $casedword, $posttext) =
200	$text =~ /(.?)(?<![a-z])(\Q$word\E)(?![a-z])(.)/i;
201
202	# Remove a leading hyphen from $word so that it sorts according to
203	# the leading letter. What is actually output is $casedword, which
204	# retains the hyphen.
205
206	$word =~ s/^-//;
207
208	my($prelen) = length $pretext;
209	my($postlen) = length $posttext;
210
211	# We want to chop excessively long entries on either side. We can't set
212	# a fixed length because of the HTML control data. Call a function to
213	# add the given length to allow for HTML stuff. This is crude, but it
214	# does roughtly the right thing.
215
216	my($leftlen) = &setlen(70, $pretext);
217	my($rightlen) = &setlen(70, $posttext);
218
219	if ($prelen > $leftlen)
220	{
221	my($cutoff) = $leftlen;
222	$cutoff++
223	while ($cutoff < $prelen && substr($pretext, -$cutoff, 1) ne " ");
224	$pretext = "... " . substr($pretext, -$cutoff);
225	}
226
227	if ($postlen > $rightlen)
228	{
229	my($cutoff) = $rightlen;
230	$cutoff++
231	while ($cutoff < $postlen && substr($posttext, $cutoff, 1) ne " ");
232	$posttext = substr($posttext, 0, $cutoff) . "...";
233	}
234
235	# If the pre text has a font-ending not preceded by a font beginning
236	# (i.e. we've chopped the beginning off), we must insert a beginning.
237
238	while ($pretext =~ /^(.*?)<\/(small\|tt\|b\|i)>/ && $1 !~ /<$2>/)
239	{
240	$pretext = "<$2>" . $pretext;
241	}
242
243	# If the pre text ends in a special font, we have to terminate that,
244	# and reset it at the start of the post text.
245
246	my($poststart) = "";
247
248	while ($pretext =~ /<(small\|tt\|b\|i)>(?!.*?<\/\1>)/)
249	{
250	$pretext .= "</$1>";
251	$poststart .= "<$1>";
252	}
253
254	# If the post text changes font but doesn't close it, we must add
255	# the closure.
256
257	while ($posttext =~ /<(small\|tt\|b\|i)>(?!.*?<\/\1>)/)
258	{
259	$posttext .= "</$1>";
260	}
261
262	# Remove any unnecessary changes in either of them
263
264	$pretext =~ s/<(small\|tt\|b\|i)>\s*<\/\1>//g;
265	$posttext =~ s/<(small\|tt\|b\|i)>\s*<\/\1>//g;
266
267	# Save the texts in associative arrays. Add the question number to
268	# the end of the word to make the key.
269
270	$pre =~ /(Q\d\d\d\d)/;
271	my($key) = "$word-$1";
272
273	$tableft{$key} = $pre . $pretext;
274	$tabright{$key} = $poststart .
275	"<font color=\"#FF0A0A\">$casedword</font>" . $posttext;
276	}
277	}
278
279	close(IN);
280
281	# Now write out the files. Each letter in the index goes in a different file
282
283	$current_initial = "";
284
285	foreach $key (sort keys %tableft)
286	{
287	my($initial) = $key =~ /^(.)/;
288	$initial = "\U$initial\E";
289
290	if ($initial ne $current_initial)
291	{
292	if ($current_initial ne "")
293	{
294	print OUT "</table>\n";
295	&write_initials($current_initial);
296	print OUT "</body>\n</html>\n";
297	close OUT;
298	}
299
300	open (OUT, ">html/FAQ-KWIC_$initial.html") \|\|
301	die "Can't open html/FAQ-KWIC_$initial.html\n";
302	print OUT
303	"<html>\n" .
304	"<head>\n" .
305	"<title>Exim FAQ: KWIC index section $initial</title>\n" .
306	"</head>\n" .
307	"<body bgcolor=\"#F8F8F8\" text=\"#00005A\" link=\"#0066FF\" alink=\"#0066FF\" vlink=\"#000099\">\n" .
308	"<h1>Exim FAQ: Keyword-in-context index</h1>\n";
309
310	write_initials($initial);
311
312	if ($initial eq "A")
313	{
314	print OUT <<End ;
315	<p>
316	This <i>Keyword-in-context</i> index for the Exim FAQ is generated
317	automatically from the FAQ source. Browsers may not display the data very
318	prettily, but it is hoped that it may provide a useful aid for finding things
319	in the FAQ.
320	</p>
321	End
322	}
323
324	print OUT "<table border>\n";
325	$current_initial = $initial;
326	}
327
328	print OUT "<tr>\n";
329	print OUT "<td align=\"right\">$tableft{$key}</td>\n";
330	print OUT "<td align=\"left\">$tabright{$key}</td>\n";
331	print OUT "</tr>\n";
332	}
333
334	# Close the final file
335
336	if ($current_initial ne "")
337	{
338	print OUT "</table>\n";
339	&write_initials($current_initial);
340	print OUT "</body>\n</html>\n";
341	close OUT;
342	}
343
344	# End