Start
[exim.git] / doc / doc-scripts / fc2k
1 #! /usr/bin/perl -w
2 # $Cambridge: exim/doc/doc-scripts/fc2k,v 1.1 2004/10/07 15:04:35 ph10 Exp $
3
4 # Script to read the HTML table of contents for the Exim FAQ and create an
5 # HTML KWIC index out of it.
6
7
8 ########################################################################
9 # List of words to ignore - kept alphabetically for reference, but they
10 # don't have to be in order.
11
12 $ignore_list = "
13
14 a ability able about address addresses addressed affect affected
15 after against all allow allowed allows already also although always am an and
16 and/or any anybody anyone anything anywhere are aren't arrange arrive as at
17
18 back bad based basically be because been behave behaviour being best between
19 bob both bug build builds built busy but by
20
21 call called calls can can't cannot causes causing central certain code comes
22 coming command commands complain complaining complains configure configured
23 conjunction contact contain contains contained correct correctly could
24 currently customer
25
26 day days defined deliver delivers delivered delivery deliveries did do does
27 doesn't doing don't down during
28
29 e-mail e-mails each easy else email emails entirely entries entry especially
30 etc even ever every example exim exim's experiencing
31
32 far few file files find fine fly following for form found from fully
33
34 get gets getting given gives giving go goes going got
35
36 handle handles handled handling happen happens has have haven't having helpful
37 him host hosts how however
38
39 i i'd i'm i've if in indeed instead into is issue issues isn't it it's its
40
41 jim just
42
43 keep keeps know knows
44
45 like line lines look looked looking lot
46
47 machine machines machine's mail mails main make me mean means message messages
48 might more must my myself
49
50 near need neither no nor not now
51
52 occur of off often ok on one only or other our out over own
53
54 part parts particular per place possibility possible present problem problems
55 put puts
56
57 quite
58
59 raised rather really reason rid right round run runs
60
61 same say saying see seeing seem seems seen sees set setting she should so some
62 somehow something sometimes stand state statement still strange such supposed
63 system systems
64
65 take takes than that the their them then there these they things think this
66 those to try though to/for told too tried tries trying
67
68 under until up use uses used using usually
69
70 valid value values via
71
72 want wanted wanting was way we we've well what what's when where whereabouts
73 whenever whether which while who whose why will with within without wish won't
74 wondered work worked working works would wrong
75
76 xxx
77
78 yet yyy
79
80 ";
81 ########################################################################
82
83
84 # The regular expression fragment that defines the separator between words
85
86 $wordgap = "(?:[]().?,;:\"']|(?><[^>]*>))*(?:\\s+|\$)(?:[[(\"'`]|(?><[^>]*>))*";
87
88
89 ########################################################################
90 # Function to add to a length to accommodate HTML stuff
91
92 sub setlen{
93 my($len, $s) = @_;
94
95 $len += length($1) while ($s =~ /(<\/?[a-z]+>)/ig);
96 $len += 1 while ($s =~ /&#\d+;/g);
97
98 return $len;
99 }
100
101
102 ########################################################################
103 # Function to write out the list of initials with references
104
105 sub write_initials {
106 my($this_initial) = "$_[0]";
107
108 print OUT "<p>\n&nbsp;&nbsp;";
109
110 foreach $initial (sort keys %initials)
111 {
112 if ($initial eq $this_initial)
113 {
114 print OUT "&nbsp;&nbsp;&nbsp;<font size=7 color=\"#FF0A0A\"><b>$initial</b></font>&nbsp;";
115 }
116 else
117 {
118 print OUT "<a href=\"FAQ-KWIC_$initial.html\">&nbsp;&nbsp;$initial</a>";
119 }
120 }
121
122 print OUT "&nbsp;"x4 . "<a href=\"FAQ.html#TOC\">FAQ Contents</a>\n</p>\n";
123 }
124
125
126
127 ########################################################################
128 # The main program. We can pick out the contents lines because they lie
129 # between <li> and </li> in the file, sometimes on more than one physical
130 # line.
131
132 # Turn the list of ignorable words into a hash for quick lookup. Add the
133 # empty word to the list.
134
135 @words = split /\s+/, $ignore_list;
136 foreach $word (@words) { $ignore{$word} = 1; }
137 $ignore{""} = 1;
138
139
140 # Open the file and do the job
141
142 open(IN, "html/FAQ.html") || die "Can't open html/FAQ.html\n";
143
144 while (<IN>)
145 {
146 next unless /^<li>/;
147 $_ .= <IN> while !/<\/li>$/;
148 chomp;
149 s/\n\s*/ /g;
150
151 # Extract the operative text into $text, with the beginning in $pre.
152
153 my($pre,$text,$post) = /^<li>(.*<\/a>:(?:&nbsp;)*)(.*)<br><br><\/li>$/;
154
155 # Now split into words. As well as punctuation, there may be HTML thingies
156 # between words. Absorb them into the separators.
157
158 my(@words) = split /$wordgap/, $text;
159
160 # Lower case all the words, and remove those that we don't want.
161 # Then keep a list of all the used initials.
162
163 REMOVE_IGNORE:
164 for ($i = 0; $i < scalar @words; $i++)
165 {
166 my($word) = $words[$i] = "\L$words[$i]\E";
167
168 # Remove certain forms of word and those on the ignore list
169
170 if (defined $ignore{$word} || # word on ignore list
171 $word =~ /^-+$/ || # word consists entirely of hyphens
172 $word =~ /^-[^a-z]/ || # follows leading hyphen with non-letter
173 $word =~ /^[^a-z-]/ || # starts with a non-letter or hyphen
174 $word =~ /[@^.]/ # contains @ or ^ or .
175 )
176 {
177 splice(@words, $i, 1);
178 redo REMOVE_IGNORE if $i < scalar @words;
179 }
180
181 # Otherwise, build up a list of initials
182
183 else
184 {
185 my($inword) = $word;
186 $inword =~ s/^-//;
187 $initial = substr($inword, 0, 1);
188 $initials{"\U$initial\E"} = 1;
189 }
190 }
191
192 # Create the lines for the KWIC index, and store them in associative
193 # arrays, with the keyword as the key. That will get them sorted
194 # automatically.
195
196 while (scalar @words > 0)
197 {
198 my($word) = shift @words;
199 my($pretext, $casedword, $posttext) =
200 $text =~ /(.*?)(?<![a-z])(\Q$word\E)(?![a-z])(.*)/i;
201
202 # Remove a leading hyphen from $word so that it sorts according to
203 # the leading letter. What is actually output is $casedword, which
204 # retains the hyphen.
205
206 $word =~ s/^-//;
207
208 my($prelen) = length $pretext;
209 my($postlen) = length $posttext;
210
211 # We want to chop excessively long entries on either side. We can't set
212 # a fixed length because of the HTML control data. Call a function to
213 # add the given length to allow for HTML stuff. This is crude, but it
214 # does roughtly the right thing.
215
216 my($leftlen) = &setlen(70, $pretext);
217 my($rightlen) = &setlen(70, $posttext);
218
219 if ($prelen > $leftlen)
220 {
221 my($cutoff) = $leftlen;
222 $cutoff++
223 while ($cutoff < $prelen && substr($pretext, -$cutoff, 1) ne " ");
224 $pretext = "... " . substr($pretext, -$cutoff);
225 }
226
227 if ($postlen > $rightlen)
228 {
229 my($cutoff) = $rightlen;
230 $cutoff++
231 while ($cutoff < $postlen && substr($posttext, $cutoff, 1) ne " ");
232 $posttext = substr($posttext, 0, $cutoff) . "...";
233 }
234
235 # If the pre text has a font-ending not preceded by a font beginning
236 # (i.e. we've chopped the beginning off), we must insert a beginning.
237
238 while ($pretext =~ /^(.*?)<\/(small|tt|b|i)>/ && $1 !~ /<$2>/)
239 {
240 $pretext = "<$2>" . $pretext;
241 }
242
243 # If the pre text ends in a special font, we have to terminate that,
244 # and reset it at the start of the post text.
245
246 my($poststart) = "";
247
248 while ($pretext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
249 {
250 $pretext .= "</$1>";
251 $poststart .= "<$1>";
252 }
253
254 # If the post text changes font but doesn't close it, we must add
255 # the closure.
256
257 while ($posttext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
258 {
259 $posttext .= "</$1>";
260 }
261
262 # Remove any unnecessary changes in either of them
263
264 $pretext =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
265 $posttext =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
266
267 # Save the texts in associative arrays. Add the question number to
268 # the end of the word to make the key.
269
270 $pre =~ /(Q\d\d\d\d)/;
271 my($key) = "$word-$1";
272
273 $tableft{$key} = $pre . $pretext;
274 $tabright{$key} = $poststart .
275 "<font color=\"#FF0A0A\">$casedword</font>" . $posttext;
276 }
277 }
278
279 close(IN);
280
281 # Now write out the files. Each letter in the index goes in a different file
282
283 $current_initial = "";
284
285 foreach $key (sort keys %tableft)
286 {
287 my($initial) = $key =~ /^(.)/;
288 $initial = "\U$initial\E";
289
290 if ($initial ne $current_initial)
291 {
292 if ($current_initial ne "")
293 {
294 print OUT "</table>\n";
295 &write_initials($current_initial);
296 print OUT "</body>\n</html>\n";
297 close OUT;
298 }
299
300 open (OUT, ">html/FAQ-KWIC_$initial.html") ||
301 die "Can't open html/FAQ-KWIC_$initial.html\n";
302 print OUT
303 "<html>\n" .
304 "<head>\n" .
305 "<title>Exim FAQ: KWIC index section $initial</title>\n" .
306 "</head>\n" .
307 "<body bgcolor=\"#F8F8F8\" text=\"#00005A\" link=\"#0066FF\" alink=\"#0066FF\" vlink=\"#000099\">\n" .
308 "<h1>Exim FAQ: Keyword-in-context index</h1>\n";
309
310 write_initials($initial);
311
312 if ($initial eq "A")
313 {
314 print OUT <<End ;
315 <p>
316 This <i>Keyword-in-context</i> index for the Exim FAQ is generated
317 automatically from the FAQ source. Browsers may not display the data very
318 prettily, but it is hoped that it may provide a useful aid for finding things
319 in the FAQ.
320 </p>
321 End
322 }
323
324 print OUT "<table border>\n";
325 $current_initial = $initial;
326 }
327
328 print OUT "<tr>\n";
329 print OUT "<td align=\"right\">$tableft{$key}</td>\n";
330 print OUT "<td align=\"left\">$tabright{$key}</td>\n";
331 print OUT "</tr>\n";
332 }
333
334 # Close the final file
335
336 if ($current_initial ne "")
337 {
338 print OUT "</table>\n";
339 &write_initials($current_initial);
340 print OUT "</body>\n</html>\n";
341 close OUT;
342 }
343
344 # End