From: Josh Roesslein <jroesslein@gmail.com>
Date: Mon, 12 Oct 2009 05:13:26 +0000 (-0500)
Subject: Fix parsing of search result 'source' attribute. Properly unescape html.
X-Git-Url: https://vcs.fsf.org/?a=commitdiff_plain;h=0a954c9f7705417757e8d58ce12469453f7e3099;p=tweepy.git

Fix parsing of search result 'source' attribute. Properly unescape html.
---

diff --git a/CHANGES b/CHANGES
index be4d16d..4c0ae70 100644
--- a/CHANGES
+++ b/CHANGES
@@ -13,6 +13,7 @@ during upgrade will be listed here.
     + API.friends_ids and API.followers_ids now return a list of integers.
       Parser updated to handle cursor responses. See above.
     + Fix Status.source_url parsing
+    + Fix search result 'source' parsing to properly unescape HTML and extract the source
 + Cursor
     Added the Cursor object to help with pagination within the API.
     Please see the pagination tutorial for more details.
diff --git a/tweepy/parsers.py b/tweepy/parsers.py
index 152de4c..84fcf53 100644
--- a/tweepy/parsers.py
+++ b/tweepy/parsers.py
@@ -2,6 +2,8 @@
 # Copyright 2009 Joshua Roesslein
 # See LICENSE
 
+import htmlentitydefs
+import re
 from datetime import datetime
 
 from . models import models
@@ -40,6 +42,28 @@ def _parse_search_datetime(str):
     return datetime.strptime(str, '%a, %d %b %Y %H:%M:%S +0000')
 
 
+def unescape_html(text):
+    """Convert HTML entities and character references in text to unicode."""
+    def fixup(m):
+        entity = m.group(0)
+        if entity[:2] == "&#":
+            # Numeric character reference: "&#xHH;" is hex, "&#DDD;" decimal.
+            try:
+                if entity[:3] == "&#x":
+                    return unichr(int(entity[3:-1], 16))
+                return unichr(int(entity[2:-1]))
+            except (ValueError, OverflowError):
+                pass  # malformed digits or code point out of unichr() range
+        else:
+            # Named entity such as "&amp;"; unknown names are left untouched.
+            try:
+                return unichr(htmlentitydefs.name2codepoint[entity[1:-1]])
+            except KeyError:
+                pass
+        return entity  # leave unrecognised sequences as is
+    return re.sub(r"&#?\w+;", fixup, text)
+
+
 def _parse_html_value(html):
 
     return html[html.find('>')+1:html.rfind('<')]
@@ -207,6 +231,8 @@ def _parse_search_result(obj, api):
     for k, v in obj.items():
         if k == 'created_at':
             setattr(result, k, _parse_search_datetime(v))
+        elif k == 'source':
+            setattr(result, k, _parse_html_value(unescape_html(v)))
         else:
             setattr(result, k, v)
     return result