Commit | Line | Data |
---|---|---|
ddf5b37e TG |
#!/bin/bash

# page-reformat: reformat the original pages of emailselfdefense.fsf.org
#
# Usage:
#   cd to the directory where the script is located, then:
#   $ ./page-reformat [path to the HTML file]
#   You can also drag-and-drop the file into the terminal.
#
# The result has extension .html.html to avoid writing over the original page.
#
# If you want to wrap the text, set WRAP_WIDTH to the desired line width:
#   $ WRAP_WIDTH=95 ./page-reformat [path to the HTML file]
#
# The sed programs below rely on GNU sed extensions (-i without a suffix,
# "\n" in replacements, "\+" and "\?").

#######################################
# Reformat one HTML page into "<page>.html".
# Globals:   WRAP_WIDTH (read, optional) - wrap the result with fmt(1)
# Arguments: $1 - path to the HTML file
# Outputs:   writes "$1.html"; diagnostics go to stderr
# Returns:   0 on success, 2 on missing argument, non-zero on any failure
#######################################
# The body is a subshell so that "set -e", the EXIT trap and the helper
# function stay private to one call.
main() (
  set -euo pipefail

  if (($# < 1)); then
    printf 'Usage: %s <path to the HTML file>\n' "${0##*/}" >&2
    exit 2
  fi

  local src=$1
  local out="${src}.html"

  if [[ ! -f "$src" || ! -r "$src" ]]; then
    printf '%s: cannot read %s\n' "${0##*/}" "$src" >&2
    exit 1
  fi

  # Work on a private scratch file instead of a fixed "tmp" in the current
  # directory, so an unrelated file of that name is never overwritten.
  local work
  work=$(mktemp) || exit 1
  trap 'rm -f -- "$work"' EXIT

  # Apply one sed program in place to the scratch file.
  edit() { sed -i -e "$1" -- "$work"; }

  cp -- "$src" "$work"

  # Remove javascript, which shouldn't be reformatted.
  edit '/jquery-1.11.0.min.js/,$d'

  # Remove leading and trailing spaces/tabs.
  edit 's,\t, ,g'
  edit 's,^ *,,'
  edit 's, *$,,'

  # Remove LF after </a>.
  edit '/<\/a>$/ {N; s,<\/a>\n\([^<]\),<\/a>\1,}'

  # One string per paragraph, header or list item.
  local tag
  for tag in li p strong a h3; do
    edit "/<${tag}[^>]*>$/ {N; s,\\n, ,}"
  done
  for tag in a strong; do
    edit "/<\\/${tag}>$/ {N; s,\\n, ,}"
  done
  # This command may need to be repeated. Adjust the number of repeats. This
  # could be done by looping back to a sed marker, but a shell loop seems
  # quicker.
  for _ in 1 2; do
    edit '/[^<>]$/ {N; s,\([^<>]\)\n,\1 ,}'
  done

  edit '/ \/>$/ {N; s,\( \/>\)\n,\1 ,}'
  edit '/ <a[^>]*>$/ {N; s,\(<a[^>]*>\)\n\([^<]\),\1 \2,}'

  # Make sure there is only one paragraph per string. This command may need to
  # be repeated. Adjust the number of repeats.
  for _ in 1 2; do
    edit 's,</p>\(.\+\)$,</p>\n\1,'
  done

  # Single out the tags which include p (will also work for pre).
  edit 's,\(.\)<p,\1\n<p,'

  # Single-out input meta and link.
  # "link" is listed twice: the substitution has no g flag, so each pass
  # splits off at most one tag per line.
  for tag in input meta link link; do
    edit "s,> <${tag},>\n<${tag},"
  done

  # Remove leading and trailing spaces, repeated spaces and blank lines.
  edit 's,^ *,,'
  edit 's, *$,,'
  # Collapse every run of spaces into a single space.
  edit 's,  *, ,g'
  edit '/^$/d'

  # Fuse comment with </p>.
  edit '/<\/p>$/ {N;s,\n\(<!-- [^~]\),\1,}'

  # Separate truncated "~~~" comment from fused tag.
  edit 's,~~~[ ]\?[-]\?[-]\?[ ]\?<,~~~\n<,'

  # Fuse header, section and footer with the corresponding div.
  for tag in header section footer; do
    edit "/^<${tag}/ {N; s,\\(<${tag}[^>]*>\\)\\n<div>,\\1<div>,}"
    edit "/^<\\/div>$/ {N; s,<\\/div>\\n\\(<\\/${tag}>\\),</div>\\1,}"
  done

  # Add LF before main sections and commented-out parts.
  edit 's,<!-- ~~,\n<!-- ~~,'
  edit '/COMMENTED OUT/ s,^,\n,'

  # Make the text more readable.
  for tag in p h1 h2 h3 h4 dl title form; do
    edit "s,<${tag},\\n&,"
  done
  for tag in p dl ul h1 h2 h3 h4 title head footer form script; do
    edit "/<\\/${tag}>/s,$,\\n,"
  done
  edit '/<\/dd>/ {N; s,</dd>\n<dt,</dd>\n\n<dt,}'
  edit '/<\/dt>/ {N; s,</dt>\n<dd,</dt>\n\n<dd,}'
  edit 's,</p></span>$,</p>\n</span>,'

  edit 's, alt=,\nalt=,g'
  edit 's, | , |\n,g'

  # Write the result next to the original page, wrapping the text on request.
  # The scratch file is copied rather than moved so that the result does not
  # inherit the private permissions mktemp gives to the scratch file.
  if [[ -n "${WRAP_WIDTH:-}" ]]; then
    fmt -s -w "$WRAP_WIDTH" -- "$work" >"$out"
  else
    cp -- "$work" "$out"
  fi

  # Remove extra LFs, if any.
  sed -i -e ':a /^$/ {N; s,\n$,,; ba}' -- "$out"
  sed -i -e ':a /^\n*$/ {$d; N; ba}' -- "$out"
)

main "$@"