Fix parsing of search result 'source' attribute. Properly unescape html.
author Josh Roesslein <jroesslein@gmail.com>
Mon, 12 Oct 2009 05:13:26 +0000 (00:13 -0500)
committer Josh Roesslein <jroesslein@gmail.com>
Mon, 12 Oct 2009 05:17:23 +0000 (00:17 -0500)
CHANGES
tweepy/parsers.py

diff --git a/CHANGES b/CHANGES
index be4d16d801141a4754865059e3cc9b2572113dfc..4c0ae707c7fcf38a5fc71903354730c1a5c41903 100644 (file)
--- a/CHANGES
+++ b/CHANGES
@@ -13,6 +13,7 @@ during upgrade will be listed here.
     + API.friends_ids and API.followers_ids now return a list of integers.
       Parser updated to handle cursor responses. See above.
     + Fix Status.source_url parsing
+    + Fix search result 'source' parsing to properly unescape html and extract source
 + Cursor
     Added the Cursor object to help with pagination within the API.
     Please see the pagination tutorial for more details.
diff --git a/tweepy/parsers.py b/tweepy/parsers.py
index 152de4c03c5648e7a81812a5e436e0afa9153f74..84fcf53a023c3f0e0d8efd24e5a6e1c415fa1d6f 100644 (file)
--- a/tweepy/parsers.py
+++ b/tweepy/parsers.py
@@ -2,6 +2,8 @@
 # Copyright 2009 Joshua Roesslein
 # See LICENSE
 
+import htmlentitydefs
+import re
 from datetime import datetime
 
 from . models import models
@@ -40,6 +42,28 @@ def _parse_search_datetime(str):
     return datetime.strptime(str, '%a, %d %b %Y %H:%M:%S +0000')
 
 
+def unescape_html(text):  # return text with HTML character references / named entities replaced by their characters
+    def fixup(m):  # re.sub callback: produce the replacement for one matched "&...;" token
+        text = m.group(0)  # the whole matched token, including the leading "&" and trailing ";"
+        if text[:2] == "&#":
+            # numeric character reference: "&#x...;" is parsed as hex, "&#...;" as decimal
+            try:
+                if text[:3] == "&#x":
+                    return unichr(int(text[3:-1], 16))
+                else:
+                    return unichr(int(text[2:-1]))
+            except ValueError:  # digits did not parse, or unichr() rejected the code point
+                pass
+        else:
+            # named entity: look up the name between "&" and ";" in htmlentitydefs.name2codepoint
+            try:
+                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
+            except KeyError:  # unknown entity name
+                pass
+        return text # resolved named entity, or the original token left as is when it could not be resolved
+    return re.sub("&#?\w+;", fixup, text)  # run fixup over every "&name;" / "&#digits;" style token in text
+
+
 def _parse_html_value(html):
 
     return html[html.find('>')+1:html.rfind('<')]
@@ -207,6 +231,8 @@ def _parse_search_result(obj, api):
     for k, v in obj.items():
         if k == 'created_at':
             setattr(result, k, _parse_search_datetime(v))
+        elif k == 'source':
+            setattr(result, k, _parse_html_value(unescape_html(v)))
         else:
             setattr(result, k, v)
     return result