Start
[exim.git] / doc / doc-scripts / fc2k
CommitLineData
495ae4b0
PH
1#! /usr/bin/perl -w
2# $Cambridge: exim/doc/doc-scripts/fc2k,v 1.1 2004/10/07 15:04:35 ph10 Exp $
3
4# Script to read the HTML table of contents for the Exim FAQ and create an
5# HTML KWIC index out of it.
6
7
8########################################################################
9# List of words to ignore - kept alphabetically for reference, but they
10# don't have to be in order.
11
12$ignore_list = "
13
14a ability able about address addresses addressed affect affected
15after against all allow allowed allows already also although always am an and
16and/or any anybody anyone anything anywhere are aren't arrange arrive as at
17
18back bad based basically be because been behave behaviour being best between
19bob both bug build builds built busy but by
20
21call called calls can can't cannot causes causing central certain code comes
22coming command commands complain complaining complains configure configured
23conjunction contact contain contains contained correct correctly could
24currently customer
25
26day days defined deliver delivers delivered delivery deliveries did do does
27doesn't doing don't down during
28
29e-mail e-mails each easy else email emails entirely entries entry especially
30etc even ever every example exim exim's experiencing
31
32far few file files find fine fly following for form found from fully
33
34get gets getting given gives giving go goes going got
35
36handle handles handled handling happen happens has have haven't having helpful
37him host hosts how however
38
39i i'd i'm i've if in indeed instead into is issue issues isn't it it's its
40
41jim just
42
43keep keeps know knows
44
45like line lines look looked looking lot
46
47machine machines machine's mail mails main make me mean means message messages
48might more must my myself
49
50near need neither no nor not now
51
52occur of off often ok on one only or other our out over own
53
54part parts particular per place possibility possible present problem problems
55put puts
56
57quite
58
59raised rather really reason rid right round run runs
60
61same say saying see seeing seem seems seen sees set setting she should so some
62somehow something sometimes stand state statement still strange such supposed
63system systems
64
65take takes than that the their them then there these they things think this
66those to try though to/for told too tried tries trying
67
68under until up use uses used using usually
69
70valid value values via
71
72want wanted wanting was way we we've well what what's when where whereabouts
73whenever whether which while who whose why will with within without wish won't
74wondered work worked working works would wrong
75
76xxx
77
78yet yyy
79
80";
81########################################################################
82
83
84# The regular expression fragment that defines the separator between words
85
86$wordgap = "(?:[]().?,;:\"']|(?><[^>]*>))*(?:\\s+|\$)(?:[[(\"'`]|(?><[^>]*>))*";
87
88
89########################################################################
90# Function to add to a length to accommodate HTML stuff
91
92sub setlen{
93my($len, $s) = @_;
94
95$len += length($1) while ($s =~ /(<\/?[a-z]+>)/ig);
96$len += 1 while ($s =~ /&#\d+;/g);
97
98return $len;
99}
100
101
102########################################################################
103# Function to write out the list of initials with references
104
105sub write_initials {
106my($this_initial) = "$_[0]";
107
108print OUT "<p>\n&nbsp;&nbsp;";
109
110foreach $initial (sort keys %initials)
111 {
112 if ($initial eq $this_initial)
113 {
114 print OUT "&nbsp;&nbsp;&nbsp;<font size=7 color=\"#FF0A0A\"><b>$initial</b></font>&nbsp;";
115 }
116 else
117 {
118 print OUT "<a href=\"FAQ-KWIC_$initial.html\">&nbsp;&nbsp;$initial</a>";
119 }
120 }
121
122print OUT "&nbsp;"x4 . "<a href=\"FAQ.html#TOC\">FAQ Contents</a>\n</p>\n";
123}
124
125
126
127########################################################################
128# The main program. We can pick out the contents lines because they lie
129# between <li> and </li> in the file, sometimes on more than one physical
130# line.
131
132# Turn the list of ignorable words into a hash for quick lookup. Add the
133# empty word to the list.
134
135@words = split /\s+/, $ignore_list;
136foreach $word (@words) { $ignore{$word} = 1; }
137$ignore{""} = 1;
138
139
140# Open the file and do the job
141
142open(IN, "html/FAQ.html") || die "Can't open html/FAQ.html\n";
143
144while (<IN>)
145 {
146 next unless /^<li>/;
147 $_ .= <IN> while !/<\/li>$/;
148 chomp;
149 s/\n\s*/ /g;
150
151 # Extract the operative text into $text, with the beginning in $pre.
152
153 my($pre,$text,$post) = /^<li>(.*<\/a>:(?:&nbsp;)*)(.*)<br><br><\/li>$/;
154
155 # Now split into words. As well as punctuation, there may be HTML thingies
156 # between words. Absorb them into the separators.
157
158 my(@words) = split /$wordgap/, $text;
159
160 # Lower case all the words, and remove those that we don't want.
161 # Then keep a list of all the used initials.
162
163 REMOVE_IGNORE:
164 for ($i = 0; $i < scalar @words; $i++)
165 {
166 my($word) = $words[$i] = "\L$words[$i]\E";
167
168 # Remove certain forms of word and those on the ignore list
169
170 if (defined $ignore{$word} || # word on ignore list
171 $word =~ /^-+$/ || # word consists entirely of hyphens
172 $word =~ /^-[^a-z]/ || # follows leading hyphen with non-letter
173 $word =~ /^[^a-z-]/ || # starts with a non-letter or hyphen
174 $word =~ /[@^.]/ # contains @ or ^ or .
175 )
176 {
177 splice(@words, $i, 1);
178 redo REMOVE_IGNORE if $i < scalar @words;
179 }
180
181 # Otherwise, build up a list of initials
182
183 else
184 {
185 my($inword) = $word;
186 $inword =~ s/^-//;
187 $initial = substr($inword, 0, 1);
188 $initials{"\U$initial\E"} = 1;
189 }
190 }
191
192 # Create the lines for the KWIC index, and store them in associative
193 # arrays, with the keyword as the key. That will get them sorted
194 # automatically.
195
196 while (scalar @words > 0)
197 {
198 my($word) = shift @words;
199 my($pretext, $casedword, $posttext) =
200 $text =~ /(.*?)(?<![a-z])(\Q$word\E)(?![a-z])(.*)/i;
201
202 # Remove a leading hyphen from $word so that it sorts according to
203 # the leading letter. What is actually output is $casedword, which
204 # retains the hyphen.
205
206 $word =~ s/^-//;
207
208 my($prelen) = length $pretext;
209 my($postlen) = length $posttext;
210
211 # We want to chop excessively long entries on either side. We can't set
212 # a fixed length because of the HTML control data. Call a function to
213 # add the given length to allow for HTML stuff. This is crude, but it
214 # does roughtly the right thing.
215
216 my($leftlen) = &setlen(70, $pretext);
217 my($rightlen) = &setlen(70, $posttext);
218
219 if ($prelen > $leftlen)
220 {
221 my($cutoff) = $leftlen;
222 $cutoff++
223 while ($cutoff < $prelen && substr($pretext, -$cutoff, 1) ne " ");
224 $pretext = "... " . substr($pretext, -$cutoff);
225 }
226
227 if ($postlen > $rightlen)
228 {
229 my($cutoff) = $rightlen;
230 $cutoff++
231 while ($cutoff < $postlen && substr($posttext, $cutoff, 1) ne " ");
232 $posttext = substr($posttext, 0, $cutoff) . "...";
233 }
234
235 # If the pre text has a font-ending not preceded by a font beginning
236 # (i.e. we've chopped the beginning off), we must insert a beginning.
237
238 while ($pretext =~ /^(.*?)<\/(small|tt|b|i)>/ && $1 !~ /<$2>/)
239 {
240 $pretext = "<$2>" . $pretext;
241 }
242
243 # If the pre text ends in a special font, we have to terminate that,
244 # and reset it at the start of the post text.
245
246 my($poststart) = "";
247
248 while ($pretext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
249 {
250 $pretext .= "</$1>";
251 $poststart .= "<$1>";
252 }
253
254 # If the post text changes font but doesn't close it, we must add
255 # the closure.
256
257 while ($posttext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
258 {
259 $posttext .= "</$1>";
260 }
261
262 # Remove any unnecessary changes in either of them
263
264 $pretext =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
265 $posttext =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
266
267 # Save the texts in associative arrays. Add the question number to
268 # the end of the word to make the key.
269
270 $pre =~ /(Q\d\d\d\d)/;
271 my($key) = "$word-$1";
272
273 $tableft{$key} = $pre . $pretext;
274 $tabright{$key} = $poststart .
275 "<font color=\"#FF0A0A\">$casedword</font>" . $posttext;
276 }
277 }
278
279close(IN);
280
281# Now write out the files. Each letter in the index goes in a different file
282
283$current_initial = "";
284
285foreach $key (sort keys %tableft)
286 {
287 my($initial) = $key =~ /^(.)/;
288 $initial = "\U$initial\E";
289
290 if ($initial ne $current_initial)
291 {
292 if ($current_initial ne "")
293 {
294 print OUT "</table>\n";
295 &write_initials($current_initial);
296 print OUT "</body>\n</html>\n";
297 close OUT;
298 }
299
300 open (OUT, ">html/FAQ-KWIC_$initial.html") ||
301 die "Can't open html/FAQ-KWIC_$initial.html\n";
302 print OUT
303 "<html>\n" .
304 "<head>\n" .
305 "<title>Exim FAQ: KWIC index section $initial</title>\n" .
306 "</head>\n" .
307 "<body bgcolor=\"#F8F8F8\" text=\"#00005A\" link=\"#0066FF\" alink=\"#0066FF\" vlink=\"#000099\">\n" .
308 "<h1>Exim FAQ: Keyword-in-context index</h1>\n";
309
310 write_initials($initial);
311
312 if ($initial eq "A")
313 {
314 print OUT <<End ;
315<p>
316This <i>Keyword-in-context</i> index for the Exim FAQ is generated
317automatically from the FAQ source. Browsers may not display the data very
318prettily, but it is hoped that it may provide a useful aid for finding things
319in the FAQ.
320</p>
321End
322 }
323
324 print OUT "<table border>\n";
325 $current_initial = $initial;
326 }
327
328 print OUT "<tr>\n";
329 print OUT "<td align=\"right\">$tableft{$key}</td>\n";
330 print OUT "<td align=\"left\">$tabright{$key}</td>\n";
331 print OUT "</tr>\n";
332 }
333
334# Close the final file
335
336if ($current_initial ne "")
337 {
338 print OUT "</table>\n";
339 &write_initials($current_initial);
340 print OUT "</body>\n</html>\n";
341 close OUT;
342 }
343
344# End