From ddf5b37e218dd2b8e923a95aeab444f34283f88f Mon Sep 17 00:00:00 2001 From: =?utf8?q?Th=C3=A9r=C3=A8se=20Godefroy?= Date: Wed, 23 Jul 2014 18:31:28 +0200 Subject: [PATCH] fr: script to reformat the HTML pages (remove indentation, single out paragraphs, etc. --- fr/kitchen/page-reformat | 104 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100755 fr/kitchen/page-reformat diff --git a/fr/kitchen/page-reformat b/fr/kitchen/page-reformat new file mode 100755 index 0000000..cb2caf9 --- /dev/null +++ b/fr/kitchen/page-reformat @@ -0,0 +1,104 @@ +#!/bin/bash + +# page-reformat: reformat the original pages of emailselfdefense.fsf.org + +# cd to the directory where the script is located, then: +# $ ./page-reformat [path to the HTML file] +# You can also drag-and-drop the file into the terminal. +# The result has extension .html.html to avoid writing over the original page. + +# If you want to wrap the text, uncomment line 97 and comment out line 94. + + +cp $1 tmp + +# Remove javascript, which shouldn't be reformatted. +sed -i '/jquery-1.11.0.min.js/,$d' tmp + +# Remove leading and trailing spaces/tabs. +sed -i 's,\t, ,g' tmp +sed -i 's,^ *,,' tmp +sed -i 's, *$,,' tmp + +# Remove LF after </a>. +sed -i '/<\/a>$/ {N; s,<\/a>\n\([^<]\),<\/a>\1,}' tmp + +# One string per paragraph, header or list item. +for tag in li p strong a h3; do + sed -i "/<$tag[^>]*>$/ {N; s,\\n, ,}" tmp +done +for tag in a strong; do + sed -i "/<\\/$tag>$/ {N; s,\\n, ,}" tmp +done +# This command may need to be repeated. Adjust the number of repeats. This +# could be done by looping back to a sed label, but a while loop seems +# quicker. +i=0 +while (( i < 2 )); do + sed -i '/[^<>]$/ {N; s,\([^<>]\)\n,\1 ,}' tmp + let i=i+1 +done + +sed -i '/ \/>$/ {N; s,\( \/>\)\n,\1 ,}' tmp +sed -i '/ ]*>$/ {N; s,\(]*>\)\n\([^<]\),\1 \2,}' tmp + +# Make sure there is only one paragraph per string. This command may need to +# be repeated. Adjust the number of repeats. 
+i=0 +while (( i < 2 )); do + sed -i 's,

\(.\+\)$,

\n\1,' tmp + let i=i+1 +done + +# Single out the tags which include p (will also work for pre). +sed -i 's,\(.\) <$tag,>\n<$tag," tmp +done +# Remove leading and trailing spaces, double spaces and blank lines. +sed -i 's,^ *,,' tmp +sed -i 's, *$,,' tmp +sed -i 's, , ,g' tmp +sed -i '/^$/d' tmp + +# Fuse comment with

. +sed -i '/<\/p>$/ {N;s,\n\(