Fix parsing of search result 'source' attribute. Properly unescape html.

author Josh Roesslein <jroesslein@gmail.com>

Mon, 12 Oct 2009 05:13:26 +0000 (00:13 -0500)

committer Josh Roesslein <jroesslein@gmail.com>

Mon, 12 Oct 2009 05:17:23 +0000 (00:17 -0500)
author Josh Roesslein <jroesslein@gmail.com>
Mon, 12 Oct 2009 05:13:26 +0000 (00:13 -0500)
committer Josh Roesslein <jroesslein@gmail.com>
Mon, 12 Oct 2009 05:17:23 +0000 (00:17 -0500)
diff --git a/CHANGES b/CHANGES

index be4d16d801141a4754865059e3cc9b2572113dfc..4c0ae707c7fcf38a5fc71903354730c1a5c41903 100644 (file)
--- a/CHANGES
+++ b/CHANGES
@@ -13,6 +13,7 @@ during upgrade will be listed here.
      + API.friends_ids and API.followers_ids now return a list of integers.
        Parser updated to handle cursor responses. See above.
      + Fix Status.source_url parsing
+    + Fix search result 'source' parsing to properly unescape html and extract source
  + Cursor
      Added the Cursor object to help with pagination within the API.
      Please see the pagination tutorial for more details.
diff --git a/tweepy/parsers.py b/tweepy/parsers.py

index 152de4c03c5648e7a81812a5e436e0afa9153f74..84fcf53a023c3f0e0d8efd24e5a6e1c415fa1d6f 100644 (file)
--- a/tweepy/parsers.py
+++ b/tweepy/parsers.py
@@ -2,6 +2,8 @@
  # Copyright 2009 Joshua Roesslein
  # See LICENSE
  
+import htmlentitydefs
+import re
  from datetime import datetime
  
  from . models import models
@@ -40,6 +42,28 @@ def _parse_search_datetime(str):
      return datetime.strptime(str, '%a, %d %b %Y %H:%M:%S +0000')
  
  
+def unescape_html(text):
+    def fixup(m):
+        text = m.group(0)
+        if text[:2] == "&#":
+            # character reference
+            try:
+                if text[:3] == "&#x":
+                    return unichr(int(text[3:-1], 16))
+                else:
+                    return unichr(int(text[2:-1]))
+            except ValueError:
+                pass
+        else:
+            # named entity
+            try:
+                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
+            except KeyError:
+                pass
+        return text # leave as is
+    return re.sub("&#?\w+;", fixup, text)
+
+
  def _parse_html_value(html):
  
      return html[html.find('>')+1:html.rfind('<')]
@@ -207,6 +231,8 @@ def _parse_search_result(obj, api):
      for k, v in obj.items():
          if k == 'created_at':
              setattr(result, k, _parse_search_datetime(v))
+        elif k == 'source':
+            setattr(result, k, _parse_html_value(unescape_html(v)))
          else:
              setattr(result, k, v)
      return result
author	Josh Roesslein <jroesslein@gmail.com>
	Mon, 12 Oct 2009 05:13:26 +0000 (00:13 -0500)
committer	Josh Roesslein <jroesslein@gmail.com>
	Mon, 12 Oct 2009 05:17:23 +0000 (00:17 -0500)
CHANGES		patch | blob | blame | history
tweepy/parsers.py		patch | blob | blame | history