From: Josh Roesslein Date: Mon, 12 Oct 2009 05:13:26 +0000 (-0500) Subject: Fix parsing of search result 'source' attribute. Properly unescape html. X-Git-Url: https://vcs.fsf.org/?a=commitdiff_plain;h=0a954c9f7705417757e8d58ce12469453f7e3099;p=tweepy.git Fix parsing of search result 'source' attribute. Properly unescape html. --- diff --git a/CHANGES b/CHANGES index be4d16d..4c0ae70 100644 --- a/CHANGES +++ b/CHANGES @@ -13,6 +13,7 @@ during upgrade will be listed here. + API.friends_ids and API.followers_ids now return a list of integers. Parser updated to handle cursor responses. See above. + Fix Status.source_url parsing + + Fix search result 'source' parsing to properly unescape html and extract source + Cursor Added the Cursor object to help with pagination within the API. Please see the pagination tutorial for more details. diff --git a/tweepy/parsers.py b/tweepy/parsers.py index 152de4c..84fcf53 100644 --- a/tweepy/parsers.py +++ b/tweepy/parsers.py @@ -2,6 +2,8 @@ # Copyright 2009 Joshua Roesslein # See LICENSE +import htmlentitydefs +import re from datetime import datetime from . models import models @@ -40,6 +42,28 @@ def _parse_search_datetime(str): return datetime.strptime(str, '%a, %d %b %Y %H:%M:%S +0000') +def unescape_html(text): + def fixup(m): + text = m.group(0) + if text[:2] == "&#": + # character reference + try: + if text[:3] == "&#x": + return unichr(int(text[3:-1], 16)) + else: + return unichr(int(text[2:-1])) + except ValueError: + pass + else: + # named entity + try: + text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) + except KeyError: + pass + return text # leave as is + return re.sub("&#?\w+;", fixup, text) + + def _parse_html_value(html): return html[html.find('>')+1:html.rfind('<')] @@ -207,6 +231,8 @@ def _parse_search_result(obj, api): for k, v in obj.items(): if k == 'created_at': setattr(result, k, _parse_search_datetime(v)) + elif k == 'source': + setattr(result, k, _parse_html_value(unescape_html(v))) else: setattr(result, k, v) return result