Commit | Line | Data |
---|---|---|
168e428f PH |
1 | #! /usr/bin/perl |
2 | ||
168e428f PH |
3 | # Script to pre-process XML input before processing it for various purposes. |
4 | # Options specify which transformations are to be done. Monospaced literal | |
5 | # layout blocks are never touched. | |
6 | ||
7 | # Changes: | |
8 | ||
9b371988 PH |
9 | # -ascii: Replace ’ by ' |
10 | # Replace © by (c) | |
11 | # Replace † by * | |
12 | # Replace ‡ by ** | |
13 | # Replace by a space | |
14 | # Replace – by - | |
168e428f | 15 | # Put quotes round <quote> text |
068aaea8 | 16 | # |
9b371988 PH |
17 | # -quoteliteral: |
18 | # Put quotes round <literal> text | |
19 | # | |
168e428f | 20 | # -bookinfo: Remove the <bookinfo> element from the file |
068aaea8 | 21 | # |
168e428f PH |
22 | # -fi: Replace "fi" by fi except when it is in an XML element, or |
23 | # inside a <literal>. | |
068aaea8 PH |
24 | # |
25 | # -html: Certain things are done only for HTML output: | |
26 | # | |
9b371988 | 27 | # If <literallayout> is followed by optional space and then a |
068aaea8 PH |
28 | # newline, the space and newline are removed, because otherwise you |
29 | # get a blank line in the HTML output. | |
30 | # | |
f89d2485 | 31 | # -noindex Remove the XML that generates indexes. |
168e428f | 32 | # -oneindex Ditto, but add XML to generate a single index. |
9b371988 PH |
33 | # |
34 | # -optbreak Insert an optional line break (zero-width space, ​) after | |
35 | # every underscore in text within <option> and <variable> elements, | |
36 | # except when preceded by <entry> (i.e. not in tables). The same is | |
37 | # also done within a word of four or more upper-case letters (for | |
38 | # compile-time options). | |
168e428f PH |
39 | |
40 | ||
41 | ||
9b371988 | 42 | # The function that processes non-literal, non-monospaced text |
168e428f PH |
43 | |
44 | sub process() | |
45 | { | |
46 | my($s) = $_[0]; | |
47 | ||
48 | $s =~ s/fi(?![^<>]*>)/fi/g if $ligatures; | |
49 | ||
9b371988 PH |
50 | if ($optbreak) |
51 | { | |
52 | $s =~ s%(?<!<entry>)(<option>|<varname>)([^<]+)% | |
53 | my($x,$y) = ($1,$2); $y =~ s/_/_​/g; "$x"."$y"%gex; | |
54 | ||
55 | $s =~ s?\b([A-Z_]{4,})\b? | |
56 | my($x) = $1; $x =~ s/_/_​/g; "$x"?gex; | |
57 | } | |
58 | ||
168e428f PH |
59 | if ($ascii) |
60 | { | |
168e428f | 61 | $s =~ s/’/'/g; |
9b371988 PH |
62 | $s =~ s/©/(c)/g; |
63 | $s =~ s/†/*/g; | |
64 | $s =~ s/‡/**/g; | |
65 | $s =~ s/&nsbp;/ /g; | |
66 | $s =~ s/–/-/g; | |
168e428f PH |
67 | $s =~ s/<quote>/"/g; |
68 | $s =~ s/<\/quote>/"/g; | |
69 | } | |
70 | ||
71 | $s; | |
72 | } | |
73 | ||
74 | ||
75 | # The main program | |
76 | ||
168e428f PH |
77 | $ascii = 0; |
78 | $bookinfo = 0; | |
068aaea8 | 79 | $html = 0; |
168e428f | 80 | $inliteral = 0; |
068aaea8 | 81 | $inliterallayout = 0; |
168e428f PH |
82 | $ligatures = 0; |
83 | $madeindex = 0; | |
84 | $noindex = 0; | |
85 | $oneindex = 0; | |
9b371988 PH |
86 | $optbreak = 0; |
87 | $quoteliteral = 0; | |
168e428f PH |
88 | |
89 | foreach $arg (@ARGV) | |
90 | { | |
91 | if ($arg eq "-fi") { $ligatures = 1; } | |
168e428f PH |
92 | elsif ($arg eq "-ascii") { $ascii = 1; } |
93 | elsif ($arg eq "-bookinfo") { $bookinfo = 1; } | |
068aaea8 | 94 | elsif ($arg eq "-html") { $html = 1; } |
168e428f PH |
95 | elsif ($arg eq "-noindex") { $noindex = 1; } |
96 | elsif ($arg eq "-oneindex") { $oneindex = 1; } | |
9b371988 PH |
97 | elsif ($arg eq "-optbreak") { $optbreak = 1; } |
98 | elsif ($arg eq "-quoteliteral") { $quoteliteral = 1; } | |
168e428f PH |
99 | else { die "** Pre-xml: Unknown option \"$arg\"\n"; } |
100 | } | |
101 | ||
102 | while (<STDIN>) | |
103 | { | |
168e428f PH |
104 | # Remove <bookinfo> if required |
105 | ||
106 | if ($bookinfo && /^<bookinfo/) | |
107 | { | |
108 | while (<STDIN>) { last if /^<\/bookinfo/; } | |
109 | next; | |
110 | } | |
111 | ||
112 | # Copy monospaced literallayout blocks | |
113 | ||
114 | if (/^<literallayout class="monospaced">/) | |
115 | { | |
068aaea8 | 116 | $_ = substr($_, 0, -1) if $html; |
168e428f PH |
117 | print; |
118 | while (<STDIN>) | |
119 | { | |
120 | print; | |
121 | last if /^<\/literallayout>/; | |
122 | } | |
123 | next; | |
124 | } | |
125 | ||
126 | # Adjust index-generation code if required | |
127 | ||
128 | if (($noindex || $oneindex) && /^<index[\s>]/) | |
129 | { | |
130 | while (<STDIN>) | |
131 | { | |
132 | last if /^<\/index>/; | |
133 | } | |
134 | ||
135 | if ($oneindex && !$madeindex) | |
136 | { | |
137 | $madeindex = 1; | |
138 | print "<index><title>Index</title></index>\n"; | |
139 | } | |
140 | ||
141 | next; | |
142 | } | |
143 | ||
144 | # A line that is not in a monospaced literal block; keep track of which | |
145 | # parts are in <literal> and which not. The latter get processed by the | |
068aaea8 PH |
146 | # function above. Items in <literal> get quoted unless they are also in |
147 | # a <literallayout> block, or are already being quoted. | |
168e428f PH |
148 | |
149 | for (;;) | |
150 | { | |
068aaea8 PH |
151 | $_ = substr($_, 0, -1) if $html && /^<literallayout[^>]*>\s*\n$/; |
152 | $inliterallayout = 1 if /^<literallayout/; | |
153 | $inliterallayout = 0 if /^<\/literallayout/; | |
154 | ||
168e428f PH |
155 | if ($inliteral) |
156 | { | |
068aaea8 | 157 | if (/^(.*?)<\/literal>(?!<\/quote>)(.*)$/) |
168e428f PH |
158 | { |
159 | print $1; | |
9b371988 | 160 | print "\"" if $quoteliteral && !$inliterallayout; |
168e428f PH |
161 | print "</literal>"; |
162 | $inliteral = 0; | |
163 | $_ = "$2\n"; | |
164 | } | |
165 | else | |
166 | { | |
167 | print; | |
168 | last; | |
169 | } | |
170 | } | |
171 | ||
172 | # Not in literal state | |
173 | ||
174 | else | |
175 | { | |
068aaea8 | 176 | if (/^(.*?)(?<!<quote>)<literal>(.*)$/) |
168e428f PH |
177 | { |
178 | print &process($1); | |
179 | print "<literal>"; | |
9b371988 | 180 | print "\"" if $quoteliteral && !$inliterallayout; |
168e428f PH |
181 | $inliteral = 1; |
182 | $_ = "$2\n"; | |
183 | } | |
184 | else | |
185 | { | |
186 | print &process($_); | |
187 | last; | |
188 | } | |
189 | } | |
190 | } # Loop for different parts of one line | |
191 | } # Loop for multiple lines | |
192 | ||
193 | # End |