From 010d28b4f0a39103949692209106a1b47fceeaf2 Mon Sep 17 00:00:00 2001
From: Joar Wandborg <git@wandborg.com>
Date: Thu, 2 Feb 2012 21:28:21 +0100
Subject: [PATCH] ASCII art support - Fixes

- Improved(?) character set detection, chardet will not win
  over UTF-8 unless it is >= 90% sure.
- Changed the unicode.txt to ascii-portable.txt, since there is
  no unicode in the file.
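- Load the font with encoding='unic' and decode the input text from
  UTF-8 before rendering.
- Log the detected charset and the charset that will actually be used.
- Fixed the unbalanced parenthesis in a debug message and removed the
  ad-hoc __main__ test harness from asciitoimage.py.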
---
 mediagoblin/media_types/ascii/asciitoimage.py | 26 +++++--------------
 mediagoblin/media_types/ascii/processing.py   | 22 +++++++++++++---
 2 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/mediagoblin/media_types/ascii/asciitoimage.py b/mediagoblin/media_types/ascii/asciitoimage.py
index da1a3bcc..186d8066 100644
--- a/mediagoblin/media_types/ascii/asciitoimage.py
+++ b/mediagoblin/media_types/ascii/asciitoimage.py
@@ -65,7 +65,8 @@ class AsciiToImage(object):
 
         self._if = ImageFont.truetype(
             self._font,
-            self._font_size)
+            self._font_size,
+            encoding='unic')
 
         #      ,-,-^-'-^'^-^'^-'^-.
         #     ( I am a wall socket )Oo,  ___
@@ -91,6 +92,9 @@ class AsciiToImage(object):
         - Character set detection and decoding,
           http://pypi.python.org/pypi/chardet
         '''
+        # Convert the input from str to unicode
+        text = text.decode('utf-8')
+
         # TODO: Account for alternative line endings
         lines = text.split('\n')
 
@@ -123,7 +127,7 @@ class AsciiToImage(object):
 
                 px_pos = self._px_pos(char_pos)
 
-                _log.debug('Writing character "{0}" at {1} (px pos {2}'.format(
+                _log.debug('Writing character "{0}" at {1} (px pos {2})'.format(
                         char,
                         char_pos,
                         px_pos))
@@ -152,21 +156,3 @@ class AsciiToImage(object):
                 px_pos[index] = char_pos[index] * self._if_dims[index]
 
         return px_pos
-
-
-if __name__ == "__main__":
-    import urllib
-    txt = urllib.urlopen('file:///home/joar/Dropbox/ascii/install-all-the-dependencies.txt')
-
-    _log.setLevel(logging.DEBUG)
-    logging.basicConfig()
-
-    converter = AsciiToImage()
-
-    converter.convert(txt.read(), '/tmp/test.png')
-
-    '''
-    im, x, y, duration = renderImage(h, 10)
-    print "Rendered image in %.5f seconds" % duration
-    im.save('tldr.png', "PNG")
-    '''
diff --git a/mediagoblin/media_types/ascii/processing.py b/mediagoblin/media_types/ascii/processing.py
index ec530df6..96dfce80 100644
--- a/mediagoblin/media_types/ascii/processing.py
+++ b/mediagoblin/media_types/ascii/processing.py
@@ -17,10 +17,12 @@ import asciitoimage
 import chardet
 import os
 import Image
+import logging
 
 from mediagoblin import mg_globals as mgg
 from mediagoblin.processing import create_pub_filepath, THUMB_SIZE
 
+_log = logging.getLogger(__name__)
 
 def process_ascii(entry):
     '''
@@ -42,6 +44,17 @@ def process_ascii(entry):
     with queued_file:
         queued_file_charset = chardet.detect(queued_file.read())
 
+        # Only select a non-utf-8 charset if chardet is *really* sure
+        # Tested with "Feli\x0109an superjaron", which was (presumably) misdetected.
+        if queued_file_charset['confidence'] < 0.9:
+            interpreted_charset = 'utf-8'
+        else:
+            interpreted_charset = queued_file_charset['encoding']
+
+        _log.info('Charset detected: {0}\nWill interpret as: {1}'.format(
+                queued_file_charset,
+                interpreted_charset))
+
         queued_file.seek(0)  # Rewind the queued file
 
         thumb_filepath = create_pub_filepath(
@@ -73,13 +86,16 @@ def process_ascii(entry):
 
         queued_file.seek(0)  # Rewind *again*
 
-        unicode_filepath = create_pub_filepath(entry, 'unicode.txt')
+        unicode_filepath = create_pub_filepath(entry, 'ascii-portable.txt')
 
         with mgg.public_store.get_file(unicode_filepath, 'wb') \
                 as unicode_file:
+            # Decode the original file from its detected charset (or UTF8)
+            # Encode the unicode instance to ASCII and replace any non-ASCII
+            # character with a numeric HTML character reference (&#NNNN;).
             unicode_file.write(
-                    unicode(queued_file.read().decode(
-                        queued_file_charset['encoding'])).encode(
+                unicode(queued_file.read().decode(
+                        interpreted_charset)).encode(
                     'ascii',
                     'xmlcharrefreplace'))
 
-- 
2.25.1