From 010d28b4f0a39103949692209106a1b47fceeaf2 Mon Sep 17 00:00:00 2001 From: Joar Wandborg Date: Thu, 2 Feb 2012 21:28:21 +0100 Subject: [PATCH] ASCII art support - Fixes - Improved(?) character set detection, chardet will not win over UTF-8 unless it is >= 90% sure. - Changed the unicode.txt to ascii-portable.txt, since there is no unicode in the file. - etc. --- mediagoblin/media_types/ascii/asciitoimage.py | 26 +++++-------------- mediagoblin/media_types/ascii/processing.py | 22 +++++++++++++--- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/mediagoblin/media_types/ascii/asciitoimage.py b/mediagoblin/media_types/ascii/asciitoimage.py index da1a3bcc..186d8066 100644 --- a/mediagoblin/media_types/ascii/asciitoimage.py +++ b/mediagoblin/media_types/ascii/asciitoimage.py @@ -65,7 +65,8 @@ class AsciiToImage(object): self._if = ImageFont.truetype( self._font, - self._font_size) + self._font_size, + encoding='unic') # ,-,-^-'-^'^-^'^-'^-. # ( I am a wall socket )Oo, ___ @@ -91,6 +92,9 @@ class AsciiToImage(object): - Character set detection and decoding, http://pypi.python.org/pypi/chardet ''' + # Convert the input from str to unicode + text = text.decode('utf-8') + # TODO: Account for alternative line endings lines = text.split('\n') @@ -123,7 +127,7 @@ class AsciiToImage(object): px_pos = self._px_pos(char_pos) - _log.debug('Writing character "{0}" at {1} (px pos {2}'.format( + _log.debug('Writing character "{0}" at {1} (px pos {2})'.format( char, char_pos, px_pos)) @@ -152,21 +156,3 @@ class AsciiToImage(object): px_pos[index] = char_pos[index] * self._if_dims[index] return px_pos - - -if __name__ == "__main__": - import urllib - txt = urllib.urlopen('file:///home/joar/Dropbox/ascii/install-all-the-dependencies.txt') - - _log.setLevel(logging.DEBUG) - logging.basicConfig() - - converter = AsciiToImage() - - converter.convert(txt.read(), '/tmp/test.png') - - ''' - im, x, y, duration = renderImage(h, 10) - print "Rendered image in %.5f seconds" % duration - im.save('tldr.png', "PNG") - ''' diff --git a/mediagoblin/media_types/ascii/processing.py b/mediagoblin/media_types/ascii/processing.py index ec530df6..96dfce80 100644 --- a/mediagoblin/media_types/ascii/processing.py +++ b/mediagoblin/media_types/ascii/processing.py @@ -17,10 +17,12 @@ import asciitoimage import chardet import os import Image +import logging from mediagoblin import mg_globals as mgg from mediagoblin.processing import create_pub_filepath, THUMB_SIZE +_log = logging.getLogger(__name__) def process_ascii(entry): ''' @@ -42,6 +44,17 @@ def process_ascii(entry): with queued_file: queued_file_charset = chardet.detect(queued_file.read()) + # Only select a non-utf-8 charset if chardet is *really* sure + # Tested with "Feli\x0109an superjaron", which was detecte + if queued_file_charset['confidence'] < 0.9: + interpreted_charset = 'utf-8' + else: + interpreted_charset = queued_file_charset['encoding'] + + _log.info('Charset detected: {0}\nWill interpret as: {1}'.format( + queued_file_charset, + interpreted_charset)) + queued_file.seek(0) # Rewind the queued file thumb_filepath = create_pub_filepath( @@ -73,13 +86,16 @@ def process_ascii(entry): queued_file.seek(0) # Rewind *again* - unicode_filepath = create_pub_filepath(entry, 'unicode.txt') + unicode_filepath = create_pub_filepath(entry, 'ascii-portable.txt') with mgg.public_store.get_file(unicode_filepath, 'wb') \ as unicode_file: + # Decode the original file from its detected charset (or UTF8) + # Encode the unicode instance to ASCII and replace any non-ASCII + # with an HTML entity (&# unicode_file.write( - unicode(queued_file.read().decode( - queued_file_charset['encoding'])).encode( + unicode(queued_file.read().decode( + interpreted_charset)).encode( 'ascii', 'xmlcharrefreplace')) -- 2.25.1