Commit | Line | Data |
---|---|---|
168e428f PH |
1 | #! /usr/bin/perl |
2 | ||
f89d2485 | 3 | # $Cambridge: exim/doc/doc-docbook/Pre-xml,v 1.4 2007/04/11 15:26:09 ph10 Exp $ |
168e428f PH |
4 | |
5 | # Script to pre-process XML input before processing it for various purposes. | |
6 | # Options specify which transformations are to be done. Monospaced literal | |
7 | # layout blocks are never touched. | |
8 | ||
9 | # Changes: | |
10 | ||
9b371988 PH |
11 | # -ascii: Replace ’ by ' |
12 | # Replace © by (c) | |
13 | # Replace † by * | |
14 | # Replace ‡ by ** | |
15 | # Replace by a space | |
16 | # Replace – by - | |
168e428f | 17 | # Put quotes round <quote> text |
068aaea8 | 18 | # |
9b371988 PH |
19 | # -quoteliteral: |
20 | # Put quotes round <literal> text | |
21 | # | |
168e428f | 22 | # -bookinfo: Remove the <bookinfo> element from the file |
068aaea8 | 23 | # |
168e428f PH |
24 | # -fi: Replace "fi" by fi except when it is in an XML element, or |
25 | # inside a <literal>. | |
068aaea8 PH |
26 | # |
27 | # -html: Certain things are done only for HTML output: | |
28 | # | |
9b371988 | 29 | # If <literallayout> is followed by optional space and then a |
068aaea8 PH |
30 | # newline, the space and newline are removed, because otherwise you |
31 | # get a blank line in the HTML output. | |
32 | # | |
f89d2485 | 33 | # -noindex Remove the XML that generates indexes. |
168e428f | 34 | # -oneindex Ditto, but add XML to generate a single index. |
9b371988 PH |
35 | # |
36 | # -optbreak Insert an optional line break (zero-width space, ​) after | |
37 | # every underscore in text within <option> and <variable> elements, | |
38 | # except when preceded by <entry> (i.e. not in tables). The same is | |
39 | # also done within a word of four or more upper-case letters (for | |
40 | # compile-time options). | |
168e428f PH |
41 | |
42 | ||
43 | ||
9b371988 | 44 | # The function that processes non-literal, non-monospaced text |
168e428f PH |
45 | |
46 | sub process() | |
47 | { | |
48 | my($s) = $_[0]; | |
49 | ||
50 | $s =~ s/fi(?![^<>]*>)/fi/g if $ligatures; | |
51 | ||
9b371988 PH |
52 | if ($optbreak) |
53 | { | |
54 | $s =~ s%(?<!<entry>)(<option>|<varname>)([^<]+)% | |
55 | my($x,$y) = ($1,$2); $y =~ s/_/_​/g; "$x"."$y"%gex; | |
56 | ||
57 | $s =~ s?\b([A-Z_]{4,})\b? | |
58 | my($x) = $1; $x =~ s/_/_​/g; "$x"?gex; | |
59 | } | |
60 | ||
168e428f PH |
61 | if ($ascii) |
62 | { | |
168e428f | 63 | $s =~ s/’/'/g; |
9b371988 PH |
64 | $s =~ s/©/(c)/g; |
65 | $s =~ s/†/*/g; | |
66 | $s =~ s/‡/**/g; | |
67 | $s =~ s/&nsbp;/ /g; | |
68 | $s =~ s/–/-/g; | |
168e428f PH |
69 | $s =~ s/<quote>/"/g; |
70 | $s =~ s/<\/quote>/"/g; | |
71 | } | |
72 | ||
73 | $s; | |
74 | } | |
75 | ||
76 | ||
77 | # The main program | |
78 | ||
168e428f PH |
79 | $ascii = 0; |
80 | $bookinfo = 0; | |
068aaea8 | 81 | $html = 0; |
168e428f | 82 | $inliteral = 0; |
068aaea8 | 83 | $inliterallayout = 0; |
168e428f PH |
84 | $ligatures = 0; |
85 | $madeindex = 0; | |
86 | $noindex = 0; | |
87 | $oneindex = 0; | |
9b371988 PH |
88 | $optbreak = 0; |
89 | $quoteliteral = 0; | |
168e428f PH |
90 | |
91 | foreach $arg (@ARGV) | |
92 | { | |
93 | if ($arg eq "-fi") { $ligatures = 1; } | |
168e428f PH |
94 | elsif ($arg eq "-ascii") { $ascii = 1; } |
95 | elsif ($arg eq "-bookinfo") { $bookinfo = 1; } | |
068aaea8 | 96 | elsif ($arg eq "-html") { $html = 1; } |
168e428f PH |
97 | elsif ($arg eq "-noindex") { $noindex = 1; } |
98 | elsif ($arg eq "-oneindex") { $oneindex = 1; } | |
9b371988 PH |
99 | elsif ($arg eq "-optbreak") { $optbreak = 1; } |
100 | elsif ($arg eq "-quoteliteral") { $quoteliteral = 1; } | |
168e428f PH |
101 | else { die "** Pre-xml: Unknown option \"$arg\"\n"; } |
102 | } | |
103 | ||
104 | while (<STDIN>) | |
105 | { | |
168e428f PH |
106 | # Remove <bookinfo> if required |
107 | ||
108 | if ($bookinfo && /^<bookinfo/) | |
109 | { | |
110 | while (<STDIN>) { last if /^<\/bookinfo/; } | |
111 | next; | |
112 | } | |
113 | ||
114 | # Copy monospaced literallayout blocks | |
115 | ||
116 | if (/^<literallayout class="monospaced">/) | |
117 | { | |
068aaea8 | 118 | $_ = substr($_, 0, -1) if $html; |
168e428f PH |
119 | print; |
120 | while (<STDIN>) | |
121 | { | |
122 | print; | |
123 | last if /^<\/literallayout>/; | |
124 | } | |
125 | next; | |
126 | } | |
127 | ||
128 | # Adjust index-generation code if required | |
129 | ||
130 | if (($noindex || $oneindex) && /^<index[\s>]/) | |
131 | { | |
132 | while (<STDIN>) | |
133 | { | |
134 | last if /^<\/index>/; | |
135 | } | |
136 | ||
137 | if ($oneindex && !$madeindex) | |
138 | { | |
139 | $madeindex = 1; | |
140 | print "<index><title>Index</title></index>\n"; | |
141 | } | |
142 | ||
143 | next; | |
144 | } | |
145 | ||
146 | # A line that is not in a monospaced literal block; keep track of which | |
147 | # parts are in <literal> and which not. The latter get processed by the | |
068aaea8 PH |
148 | # function above. Items in <literal> get quoted unless they are also in |
149 | # a <literallayout> block, or are already being quoted. | |
168e428f PH |
150 | |
151 | for (;;) | |
152 | { | |
068aaea8 PH |
153 | $_ = substr($_, 0, -1) if $html && /^<literallayout[^>]*>\s*\n$/; |
154 | $inliterallayout = 1 if /^<literallayout/; | |
155 | $inliterallayout = 0 if /^<\/literallayout/; | |
156 | ||
168e428f PH |
157 | if ($inliteral) |
158 | { | |
068aaea8 | 159 | if (/^(.*?)<\/literal>(?!<\/quote>)(.*)$/) |
168e428f PH |
160 | { |
161 | print $1; | |
9b371988 | 162 | print "\"" if $quoteliteral && !$inliterallayout; |
168e428f PH |
163 | print "</literal>"; |
164 | $inliteral = 0; | |
165 | $_ = "$2\n"; | |
166 | } | |
167 | else | |
168 | { | |
169 | print; | |
170 | last; | |
171 | } | |
172 | } | |
173 | ||
174 | # Not in literal state | |
175 | ||
176 | else | |
177 | { | |
068aaea8 | 178 | if (/^(.*?)(?<!<quote>)<literal>(.*)$/) |
168e428f PH |
179 | { |
180 | print &process($1); | |
181 | print "<literal>"; | |
9b371988 | 182 | print "\"" if $quoteliteral && !$inliterallayout; |
168e428f PH |
183 | $inliteral = 1; |
184 | $_ = "$2\n"; | |
185 | } | |
186 | else | |
187 | { | |
188 | print &process($_); | |
189 | last; | |
190 | } | |
191 | } | |
192 | } # Loop for different parts of one line | |
193 | } # Loop for multiple lines | |
194 | ||
195 | # End |