#!/bin/bash

# page-reformat: reformat the original pages of emailselfdefense.fsf.org
#
# cd to the directory where the script is located, then:
#   $ ./page-reformat [path to the HTML file]
# You can also drag-and-drop the file into the terminal.
# The result has extension .html.html to avoid writing over the original page.
#
# If you want to wrap the text, set wrap_text to "yes" below.
#
# The sed programs rely on GNU sed extensions (-i without a suffix, \n in
# replacements, \+ and \?).

# "yes": wrap the result with fmt (95 columns); anything else: keep one string
# per paragraph, header or list item.
wrap_text=no

# Some joining/splitting passes only handle one occurrence per run and may
# need to be repeated. Adjust the number of repeats here.
repeats=2

#######################################
# Reformat one HTML page.
# All the work is done on a private copy inside a temporary directory, so the
# original page is never modified and nothing in the current directory is
# touched. The body is a subshell: variables, "set -e" and the cleanup trap
# stay local to it. Call it as a plain command ("set -e" is ignored when the
# call is part of an if/&&/|| list).
# Globals:   wrap_text (read), repeats (read)
# Arguments: $1 - path to the HTML file
# Outputs:   writes the reformatted page to "$1.html"
# Returns:   0 on success, non-zero as soon as one step fails
#######################################
reformat_page() (
  set -e

  src=$1
  out=$src.html

  # Copy into a fresh directory (rather than onto a mktemp file) so that the
  # working copy gets its permissions from the source page and the umask.
  workdir=$(mktemp -d)
  trap 'rm -rf -- "$workdir"' EXIT
  work=$workdir/page
  cp -- "$src" "$work"

  # Remove javascript, which shouldn't be reformatted (everything from the
  # jquery line to the end of the file).
  sed -i '/jquery-1.11.0.min.js/,$d' "$work"

  # Remove leading and trailing spaces/tabs.
  sed -i 's,\t, ,g' "$work"
  sed -i 's,^ *,,' "$work"
  sed -i 's, *$,,' "$work"

  # Remove LF after </a>.
  sed -i '/<\/a>$/ {N; s,<\/a>\n\([^<]\),<\/a>\1,}' "$work"

  # One string per paragraph, header or list item.
  # First join a line ending with one of these opening tags with the next one.
  for tag in li p strong a h3; do
    sed -i "/<$tag[^>]*>$/ {N; s,\\n, ,}" "$work"
  done
  # Then join a line ending with one of these closing tags with the next one.
  for tag in a strong; do
    sed -i "/<\\/$tag>$/ {N; s,\\n, ,}" "$work"
  done
  # Join a line that doesn't end with < or > with the next one. This could be
  # done by looping back to a sed marker, but a shell loop seems quicker.
  for (( i = 0; i < repeats; i++ )); do
    sed -i '/[^<>]$/ {N; s,\([^<>]\)\n,\1 ,}' "$work"
  done

  # Join a line ending with " />", or with an opening <a ...> tag followed by
  # text, with the next one.
  sed -i '/ \/>$/ {N; s,\( \/>\)\n,\1 ,}' "$work"
  sed -i '/ <a[^>]*>$/ {N; s,\(<a[^>]*>\)\n\([^<]\),\1 \2,}' "$work"

  # Make sure there is only one paragraph per string.
  for (( i = 0; i < repeats; i++ )); do
    sed -i 's,</p>\(.\+\)$,</p>\n\1,' "$work"
  done

  # Single out the tags which include p (will also work for pre).
  sed -i 's,\(.\)<p,\1\n<p,' "$work"

  # Single-out input meta and link.
  # NOTE(review): "link" is listed twice on purpose as far as one can tell:
  # without the g flag each pass only splits the first match on a line.
  for tag in input meta link link; do
    sed -i "s,> <$tag,>\n<$tag," "$work"
  done

  # Remove leading and trailing spaces, double spaces and blank lines.
  sed -i 's,^ *,,' "$work"
  sed -i 's, *$,,' "$work"
  # Bracketed spaces make the "one or more spaces" pattern explicit.
  sed -i 's,[ ][ ]*, ,g' "$work"
  sed -i '/^$/d' "$work"

  # Fuse comment with </p>.
  sed -i '/<\/p>$/ {N;s,\n\(<!-- [^~]\),\1,}' "$work"

  # Separate truncated "~~~" comment from fused tag.
  sed -i 's,~~~[ ]\?[-]\?[-]\?[ ]\?<,~~~\n<,' "$work"

  # Fuse header, section and footer with the corresponding div.
  for tag in header section footer; do
    sed -i "/^<$tag/ {N; s,\\(<$tag[^>]*>\\)\\n<div>,\\1<div>,}" "$work"
    sed -i "/^<\\/div>$/ {N; s,<\\/div>\\n\\(<\\/$tag>\\),</div>\\1,}" "$work"
  done

  # Add LF before main sections and commented-out parts.
  sed -i 's,<!-- ~~,\n<!-- ~~,' "$work"
  sed -i '/COMMENTED OUT/ s,^,\n,' "$work"

  # Make the text more readable: blank line before these opening tags...
  for tag in p h1 h2 h3 h4 dl title form; do
    sed -i "s,<$tag,\\n&," "$work"
  done
  # ... and after a line containing one of these closing tags.
  for tag in p dl ul h1 h2 h3 h4 title head footer form script; do
    sed -i "/<\\/$tag>/s,$,\\n," "$work"
  done
  sed -i '/<\/dd>/ {N; s,</dd>\n<dt,</dd>\n\n<dt,}' "$work"
  sed -i '/<\/dt>/ {N; s,</dt>\n<dd,</dt>\n\n<dd,}' "$work"
  sed -i 's,</p></span>$,</p>\n</span>,' "$work"

  # Put alt attributes and "|"-separated items on their own lines.
  sed -i 's, alt=,\nalt=,g' "$work"
  sed -i 's, | , |\n,g' "$work"

  if [[ "$wrap_text" == yes ]]; then
    # Wrap the text.
    fmt -s -w 95 -- "$work" > "$out"
  else
    mv -- "$work" "$out"
  fi

  # Remove extra LFs, if any: squeeze runs of blank lines, then drop the
  # blank lines at the end of the file.
  sed -i ':a /^$/ {N; s,\n$,,; ba}' -- "$out"
  sed -i ':a /^\n*$/ {$d; N; ba}' -- "$out"
)

#######################################
# Entry point: check the command line, then reformat the page.
# Arguments: $1 - path to the HTML file (further arguments are ignored)
# Outputs:   diagnostics on stderr
# Returns:   2 if the path is missing, 1 if it is not a regular file,
#            otherwise the status of reformat_page
#######################################
main() {
  if (( $# < 1 )); then
    printf 'Usage: %s [path to the HTML file]\n' "${0##*/}" >&2
    return 2
  fi
  if [[ ! -f "$1" ]]; then
    printf '%s: not a regular file: %s\n' "${0##*/}" "$1" >&2
    return 1
  fi
  reformat_page "$1"
}

main "$@"