Commit | Line | Data |
---|---|---|
168e428f PH |
1 | #! /usr/bin/perl |
2 | ||
4f578862 | 3 | # $Cambridge: exim/doc/doc-docbook/TidyHTML-spec,v 1.4 2006/04/04 14:03:49 ph10 Exp $ |
168e428f PH |
4 | |
5 | # Script to tidy up the spec HTML files that are generated by xmlto. The | |
6 | # following changes are made: | |
7 | # | |
8 | # 1. Tidy the index.html file by splitting the very long lines. | |
9 | # 2. Create reverse links from chapter and section titles back to the TOC. | |
10 | # 3. Tidy the ix01.html file - the actual index - by splitting long lines. | |
11 | # 4. Insert links from the letter divisions to the top of the Index. | |
068aaea8 PH |
12 | # 5. Turn <div class="literallayout"><p> into <div class="literallayout"> and |
13 | # a matching </p></div> into </div> to get rid of unwanted vertical white | |
14 | # space. | |
15 | # 6. Before each occurrence of </td> insert &nbsp; so that the table's cell | |
16 | # is a little bit wider than the text itself. | |
168e428f | 17 | |
# Move into the directory that holds the generated HTML. Every file name used
# from here on is relative, so if the chdir fails we must stop at once rather
# than read and overwrite same-named files in the directory we started in.

chdir("spec_html") || die "Failed to change directory to spec_html: $!\n";

# Counter that supplies the numeric part of each "tocNNNN" id.

$tocref = 1;
21 | ||
# Read in the index.html file. Despite its name, it is really the TOC.

open(IN, "index.html") or die "Failed to open index.html for reading: $!\n";
@toc = <IN>;
close(IN);

# The whole TOC is generated as one humungous line that is hard to check, so
# put a newline after every > (swallowing any white space that followed it).
# Two places must be left alone, because a line break there would put a space
# into the output: a > that is preceded by 'class="quote"', and the </span>
# that comes afterwards. Perl does not implement lookbehinds whose
# alternatives are of different lengths, so one pattern cannot exclude both.
# The easy way out is a second substitution that takes the newline away again
# after every </span>.

for my $entry (@toc)
  {
  $entry =~ s/(?<!class="quote")>\s*/>\n/g;
  $entry =~ s/<\/span>\n/<\/span>/g;
  }

# Break the array up so that every line is a separate element. Each element
# is replaced in place by the pieces obtained by splitting it after each
# newline.

$i = 0;
while ($i < scalar(@toc))
  {
  my(@pieces) = split /(?<=\n)/, $toc[$i];
  splice @toc, $i, 1, @pieces;
  $i++;
  }
47 | ||
# We want to create reverse links from each chapter and section title back to
# the relevant place in the TOC. Scan the TOC for the relevant entries, that
# is, lines starting with a link to a chNN.html or ixNN.html file, optionally
# followed by a #fragment. Add an id of the form "tocNNNN" to each entry, and
# remember in %backref which id belongs to which "file#fragment" target.
#
# @chlist collects the names of the files that must be processed later. A
# chapter has one TOC entry for itself and one for each of its sections, so
# each file name is recorded only the first time it is seen; otherwise every
# chapter file would be read and rewritten once per section.

foreach $line (@toc)
  {
  if ($line =~ /^<a href="((?:ch|ix)\d+\.html)(#[^"]+)?">/)
    {
    my($chix) = $1;
    # The fragment is absent for a chapter-level entry; use an empty string
    # so that the key is then just the file name.
    my($ss) = defined $2 ? $2 : "";
    my($id) = sprintf "%04d", $tocref++;
    $line =~ s/<a/<a id="toc$id"/;
    $backref{"$chix$ss"} = "toc$id";
    push @chlist, $chix unless $chseen{$chix}++;
    }
  }
65 | ||
# Write out the modified index.html file. Output is buffered, so a failed
# write may not show up until the handle is closed; check print and close as
# well as open, so that a truncated file is never left behind silently.

open (OUT, ">index.html") || die "Failed to open index.html for writing: $!\n";
print OUT @toc or die "Failed to write index.html: $!\n";
close(OUT) || die "Failed to close index.html after writing: $!\n";
71 | ||
068aaea8 PH |
# Now scan each of the other page files and insert the reverse links. While
# we are at it, we tidy up <div class="literallayout"> by removing unwanted
# paragraph marks, which generate unwanted vertical space. We also insert
# &nbsp; before </td> to push table cells apart from each other.

foreach $file (@chlist)
  {
  open(IN, "$file") || die "Failed to open $file for reading: $!\n";
  @text = <IN>;
  close(IN);

  # Insert a newline after <p>, </p>, </div> and <div ...>, and split the
  # lines so that each one is a separate element in the vector. This makes it
  # easier to recognize these elements below, where they are compared as
  # whole lines.

  foreach $line (@text)
    {
    $line =~ s/<p>\s*(?!\n)/<p>\n/g;
    $line =~ s/<\/p>\s*(?!\n)/<\/p>\n/g;
    $line =~ s/<\/div>\s*(?!\n)/<\/div>\n/g;
    $line =~ s/<div([^>]*)>(?!\n)/<div$1>\n/g;
    }

  for ($i = 0; $i < scalar(@text); $i++)
    { splice @text, $i, 1, (split /(?<=\n)/, $text[$i]); }

  # $thisdiv is set while we are inside a literallayout <div> whose opening
  # <p> has been removed, so that only the matching </p> is removed.

  $thisdiv = 0;

  for ($i = 0; $i < scalar(@text); $i++)
    {
    # A heading: an empty anchor that carries the id, followed by the title
    # text and the closing </hN>. Turn the anchor into a link back to the
    # TOC that wraps the title. The pattern does not capture the newline at
    # the end of the line, so the rewritten line has none.

    if ($text[$i] =~ /^(.*?)<a( xmlns="[^"]+")? id="([^"]+)"><\/a>(.+?)<\/h(.*)$/)
      {
      my($pre, $opt, $id, $title, $post) = ($1, $2, $3, $4, $5);

      # Section reference
      my($ref) = $backref{"$file#$id"};

      # If not found, try for a chapter reference
      $ref = $backref{"$file"} if !defined $ref;

      # Adjust the line
      $text[$i]= "$pre<a$opt href=\"index.html#$ref\" id=\"$id\">$title</a></h$post";
      }

    # A literallayout <div> immediately followed by <p>: blank out the <p>
    # (and step over it) and remember that the matching </p> must go too.

    elsif ($text[$i] =~ /^<div [^>]*?class="literallayout">$/ && $text[$i+1] eq "<p>\n")
      {
      $text[++$i] = "";
      $thisdiv = 1;
      }

    # The </p> that immediately precedes the </div> of such a literallayout.

    elsif ($thisdiv && $text[$i] eq "</p>\n" && $text[$i+1] eq "</div>\n")
      {
      $text[$i] = "";
      $thisdiv = 0;
      }

    # A line that starts with </td>: put a non-breaking space in front so
    # that the cell is a little wider than its text. This must be the entity
    # and not an ordinary space, which would collapse in the rendered page.

    elsif ($text[$i] =~ /^\s*<\/td>/)
      {
      $text[$i] = "&nbsp;$text[$i]";
      }
    }

  # Write the file back. Output is buffered, so check print and close as well
  # as open; otherwise a failed write would leave a truncated page unnoticed.

  open(OUT, ">$file") || die "Failed to open $file for writing: $!\n";
  print OUT @text or die "Failed to write $file: $!\n";
  close(OUT) || die "Failed to close $file after writing: $!\n";
  }
136 | ||
# Now process the ix01.html file, which is the actual index.

open(IN, "ix01.html") or die "Failed to open ix01.html for reading: $!\n";
@index = <IN>;
close(IN);

# The whole index is generated as one humungous line that is hard to check,
# so put a newline after every > (swallowing any white space that followed
# it).

for my $entry (@index)
  {
  $entry =~ s/>\s*/>\n/g;
  }

# Then break the array up so that every line is a separate element, replacing
# each element in place by the pieces obtained by splitting it after each
# newline.

$i = 0;
while ($i < scalar(@index))
  {
  my(@pieces) = split /(?<=\n)/, $index[$i];
  splice @index, $i, 1, @pieces;
  $i++;
  }
150 | ||
# We want to add a list of letters at the top of the index, and link back
# to them from each letter heading. First find the index title, taken to be
# the first line that consists of just </h2>, and remember its position as
# the place to insert the list of letters.

undef $listindex;

for ($i = 0; $i < scalar(@index); $i++)
  {
  if ($index[$i] =~ /^<\/h2>$/)
    {
    $listindex = $i;
    last;
    }
  }

# If there is no such line, there is nowhere sensible to put the list.
# Inserting it regardless would place an empty <h4> element at the very start
# of the file, ahead of everything else, so leave the index alone and say so.

if (!defined $listindex)
  {
  warn "No </h2> title line found in ix01.html: letter list not inserted\n";
  }
else
  {
  # Carry on from the title and scan for the letter headings, which are lines
  # consisting of a single character followed by </h3>. For a letter X, the
  # heading is wrapped in an anchor with id "XB" that links to "#XT", and a
  # matching entry with id "XT" that links to "#XB" is added to the list. The
  # opening tag goes at the start of the previous line and the closing tag at
  # the end of the heading line.

  $list = "<h4>\n";
  for (; $i < scalar(@index); $i++)
    {
    if ($index[$i] =~ /^(.)<\/h3>$/)
      {
      $letter = $1;
      $index[$i-1] =~ s/^/<a id="${letter}B" href="#${letter}T">/;
      $index[$i] =~ s/$/<\/a>/;
      $list .= "<a id=\"${letter}T\" href=\"#${letter}B\"> $letter</a>\n";
      }
    }

  # Now we know which letters we have, we can insert the list. It goes in
  # as a single element just before the remembered </h2> line.

  $list .= "</h4>\n";
  splice @index, $listindex, 0, $list;
  }
183 | ||
# Write out the modified ix01.html file. Output is buffered, so a failed
# write may not show up until the handle is closed; check print and close as
# well as open, so that a truncated file is never left behind silently.

open (OUT, ">ix01.html") || die "Failed to open ix01.html for writing: $!\n";
print OUT @index or die "Failed to write ix01.html: $!\n";
close(OUT) || die "Failed to close ix01.html after writing: $!\n";
189 | ||
190 | ||
191 | # End |