NM/01
[exim.git] / doc / doc-scripts / fc2k
CommitLineData
495ae4b0 1#! /usr/bin/perl -w
8e26e4bf 2# $Cambridge: exim/doc/doc-scripts/fc2k,v 1.2 2004/10/14 09:53:11 ph10 Exp $
495ae4b0
PH
3
4# Script to read the HTML table of contents for the Exim FAQ and create an
5# HTML KWIC index out of it.
6
7
8########################################################################
9# List of words to ignore - kept alphabetically for reference, but they
10# don't have to be in order.
11
12$ignore_list = "
13
8e26e4bf
PH
14a ability able about absence access according actual address addresses addressed
15affect affected after against aka all allow allowed allows along already also
16although always am amount an ancient and and/or annoying another any anybody
17anyone anything anywhere apparent apparently are aren't around arrange arrive
18arrives as at
495ae4b0
PH
19
20back bad based basically be because been behave behaviour being best between
8e26e4bf 21bob both box bug build builds built busy but by
495ae4b0
PH
22
23call called calls can can't cannot causes causing central certain code comes
24coming command commands complain complaining complains configure configured
25conjunction contact contain contains contained correct correctly could
26currently customer
27
28day days defined deliver delivers delivered delivery deliveries did do does
29doesn't doing don't down during
30
8e26e4bf
PH
31e-mail e-mails each easy either else email emails entirely entries entry
32especially etc even ever every example exim exim's experiencing
495ae4b0 33
8e26e4bf
PH
34far few file files find finds fine fix fixed fly following for form found from
35fully
495ae4b0 36
8e26e4bf 37generate generated get gets getting given gives giving go goes going got
495ae4b0
PH
38
39handle handles handled handling happen happens has have haven't having helpful
40him host hosts how however
41
42i i'd i'm i've if in indeed instead into is issue issues isn't it it's its
43
44jim just
45
46keep keeps know knows
47
48like line lines look looked looking lot
49
8e26e4bf
PH
50m machine machines machine's mail mails main make me mean means message messages
51might more much must my myself
495ae4b0
PH
52
53near need neither no nor not now
54
55occur of off often ok on one only or other our out over own
56
57part parts particular per place possibility possible present problem problems
58put puts
59
60quite
61
62raised rather really reason rid right round run runs
63
8e26e4bf
PH
64same say saying see seeing seem seems seen sees set setting she should simply
65sit so some somehow something sometimes stand state statement still strange such
66supposed system systems
495ae4b0 67
8e26e4bf
PH
68take takes tell than that the their them then there these they things think this
69those thought to try though to/for told too tried tries trying
495ae4b0
PH
70
71under until up use uses used using usually
72
73valid value values via
74
75want wanted wanting was way we we've well what what's when where whereabouts
76whenever whether which while who whose why will with within without wish won't
77wondered work worked working works would wrong
78
79xxx
80
81yet yyy
82
83";
84########################################################################
85
86
87# The regular expression fragment that defines the separator between words
88
89$wordgap = "(?:[]().?,;:\"']|(?><[^>]*>))*(?:\\s+|\$)(?:[[(\"'`]|(?><[^>]*>))*";
90
91
92########################################################################
93# Function to add to a length to accommodate HTML stuff
94
95sub setlen{
96my($len, $s) = @_;
97
98$len += length($1) while ($s =~ /(<\/?[a-z]+>)/ig);
99$len += 1 while ($s =~ /&#\d+;/g);
100
101return $len;
102}
103
104
105########################################################################
106# Function to write out the list of initials with references
107
108sub write_initials {
109my($this_initial) = "$_[0]";
110
111print OUT "<p>\n&nbsp;&nbsp;";
112
113foreach $initial (sort keys %initials)
114 {
115 if ($initial eq $this_initial)
116 {
117 print OUT "&nbsp;&nbsp;&nbsp;<font size=7 color=\"#FF0A0A\"><b>$initial</b></font>&nbsp;";
118 }
119 else
120 {
121 print OUT "<a href=\"FAQ-KWIC_$initial.html\">&nbsp;&nbsp;$initial</a>";
122 }
123 }
124
125print OUT "&nbsp;"x4 . "<a href=\"FAQ.html#TOC\">FAQ Contents</a>\n</p>\n";
126}
127
128
129
130########################################################################
131# The main program. We can pick out the contents lines because they lie
132# between <li> and </li> in the file, sometimes on more than one physical
133# line.
134
135# Turn the list of ignorable words into a hash for quick lookup. Add the
136# empty word to the list.
137
138@words = split /\s+/, $ignore_list;
139foreach $word (@words) { $ignore{$word} = 1; }
140$ignore{""} = 1;
141
142
143# Open the file and do the job
144
145open(IN, "html/FAQ.html") || die "Can't open html/FAQ.html\n";
146
147while (<IN>)
148 {
149 next unless /^<li>/;
150 $_ .= <IN> while !/<\/li>$/;
151 chomp;
152 s/\n\s*/ /g;
153
154 # Extract the operative text into $text, with the beginning in $pre.
155
156 my($pre,$text,$post) = /^<li>(.*<\/a>:(?:&nbsp;)*)(.*)<br><br><\/li>$/;
157
158 # Now split into words. As well as punctuation, there may be HTML thingies
159 # between words. Absorb them into the separators.
160
161 my(@words) = split /$wordgap/, $text;
162
163 # Lower case all the words, and remove those that we don't want.
164 # Then keep a list of all the used initials.
165
166 REMOVE_IGNORE:
167 for ($i = 0; $i < scalar @words; $i++)
168 {
169 my($word) = $words[$i] = "\L$words[$i]\E";
170
171 # Remove certain forms of word and those on the ignore list
172
173 if (defined $ignore{$word} || # word on ignore list
174 $word =~ /^-+$/ || # word consists entirely of hyphens
175 $word =~ /^-[^a-z]/ || # follows leading hyphen with non-letter
176 $word =~ /^[^a-z-]/ || # starts with a non-letter or hyphen
177 $word =~ /[@^.]/ # contains @ or ^ or .
178 )
179 {
180 splice(@words, $i, 1);
181 redo REMOVE_IGNORE if $i < scalar @words;
182 }
183
184 # Otherwise, build up a list of initials
185
186 else
187 {
188 my($inword) = $word;
189 $inword =~ s/^-//;
190 $initial = substr($inword, 0, 1);
191 $initials{"\U$initial\E"} = 1;
192 }
193 }
194
195 # Create the lines for the KWIC index, and store them in associative
196 # arrays, with the keyword as the key. That will get them sorted
197 # automatically.
198
199 while (scalar @words > 0)
200 {
201 my($word) = shift @words;
202 my($pretext, $casedword, $posttext) =
203 $text =~ /(.*?)(?<![a-z])(\Q$word\E)(?![a-z])(.*)/i;
204
205 # Remove a leading hyphen from $word so that it sorts according to
206 # the leading letter. What is actually output is $casedword, which
207 # retains the hyphen.
208
209 $word =~ s/^-//;
210
211 my($prelen) = length $pretext;
212 my($postlen) = length $posttext;
213
214 # We want to chop excessively long entries on either side. We can't set
215 # a fixed length because of the HTML control data. Call a function to
216 # add the given length to allow for HTML stuff. This is crude, but it
217 # does roughtly the right thing.
218
219 my($leftlen) = &setlen(70, $pretext);
220 my($rightlen) = &setlen(70, $posttext);
221
222 if ($prelen > $leftlen)
223 {
224 my($cutoff) = $leftlen;
225 $cutoff++
226 while ($cutoff < $prelen && substr($pretext, -$cutoff, 1) ne " ");
227 $pretext = "... " . substr($pretext, -$cutoff);
228 }
229
230 if ($postlen > $rightlen)
231 {
232 my($cutoff) = $rightlen;
233 $cutoff++
234 while ($cutoff < $postlen && substr($posttext, $cutoff, 1) ne " ");
235 $posttext = substr($posttext, 0, $cutoff) . "...";
236 }
237
238 # If the pre text has a font-ending not preceded by a font beginning
239 # (i.e. we've chopped the beginning off), we must insert a beginning.
240
241 while ($pretext =~ /^(.*?)<\/(small|tt|b|i)>/ && $1 !~ /<$2>/)
242 {
243 $pretext = "<$2>" . $pretext;
244 }
245
246 # If the pre text ends in a special font, we have to terminate that,
247 # and reset it at the start of the post text.
248
249 my($poststart) = "";
250
251 while ($pretext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
252 {
253 $pretext .= "</$1>";
254 $poststart .= "<$1>";
255 }
256
257 # If the post text changes font but doesn't close it, we must add
258 # the closure.
259
260 while ($posttext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
261 {
262 $posttext .= "</$1>";
263 }
264
265 # Remove any unnecessary changes in either of them
266
267 $pretext =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
268 $posttext =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
269
270 # Save the texts in associative arrays. Add the question number to
271 # the end of the word to make the key.
272
273 $pre =~ /(Q\d\d\d\d)/;
274 my($key) = "$word-$1";
275
276 $tableft{$key} = $pre . $pretext;
277 $tabright{$key} = $poststart .
278 "<font color=\"#FF0A0A\">$casedword</font>" . $posttext;
279 }
280 }
281
282close(IN);
283
284# Now write out the files. Each letter in the index goes in a different file
285
286$current_initial = "";
287
288foreach $key (sort keys %tableft)
289 {
290 my($initial) = $key =~ /^(.)/;
291 $initial = "\U$initial\E";
292
293 if ($initial ne $current_initial)
294 {
295 if ($current_initial ne "")
296 {
297 print OUT "</table>\n";
298 &write_initials($current_initial);
299 print OUT "</body>\n</html>\n";
300 close OUT;
301 }
302
303 open (OUT, ">html/FAQ-KWIC_$initial.html") ||
304 die "Can't open html/FAQ-KWIC_$initial.html\n";
305 print OUT
306 "<html>\n" .
307 "<head>\n" .
308 "<title>Exim FAQ: KWIC index section $initial</title>\n" .
309 "</head>\n" .
310 "<body bgcolor=\"#F8F8F8\" text=\"#00005A\" link=\"#0066FF\" alink=\"#0066FF\" vlink=\"#000099\">\n" .
311 "<h1>Exim FAQ: Keyword-in-context index</h1>\n";
312
313 write_initials($initial);
314
315 if ($initial eq "A")
316 {
317 print OUT <<End ;
318<p>
319This <i>Keyword-in-context</i> index for the Exim FAQ is generated
320automatically from the FAQ source. Browsers may not display the data very
321prettily, but it is hoped that it may provide a useful aid for finding things
322in the FAQ.
323</p>
324End
325 }
326
327 print OUT "<table border>\n";
328 $current_initial = $initial;
329 }
330
331 print OUT "<tr>\n";
332 print OUT "<td align=\"right\">$tableft{$key}</td>\n";
333 print OUT "<td align=\"left\">$tabright{$key}</td>\n";
334 print OUT "</tr>\n";
335 }
336
337# Close the final file
338
339if ($current_initial ne "")
340 {
341 print OUT "</table>\n";
342 &write_initials($current_initial);
343 print OUT "</body>\n</html>\n";
344 close OUT;
345 }
346
347# End