From: CYBERDEViLNL Date: Mon, 29 Oct 2018 23:05:48 +0000 (+0100) Subject: * Fetch conversation messages with BS4 and regex as fallback. X-Git-Url: https://vcs.fsf.org/?a=commitdiff_plain;h=8a24529bc1724fd46e22940cb449d4594a30e8d2;p=diaspy.git * Fetch conversation messages with BS4 and regex as fallback. -- To fetch messages, call `update_messages` on a `models.Conversation` object. -- You may now use `iter`, `len` and indexing (`getitem`) on a `models.Conversation` object to get messages or the message count. -- Call `messages` on a `models.Conversation` object to receive a list of messages (dicts). * Use BS4 for the Notification model, keeping regex as fallback. * Notification.who() now builds its result via a set, so the returned list contains no duplicates. --- diff --git a/diaspy/errors.py b/diaspy/errors.py index d5d3f0e..48d754a 100644 --- a/diaspy/errors.py +++ b/diaspy/errors.py @@ -111,7 +111,7 @@ class SearchError(DiaspyError): pass class TagError(DiaspyError): - """Exception raised when something related to settings goes wrong. + """Exception raised when something related to a tag goes wrong. """ pass diff --git a/diaspy/models.py b/diaspy/models.py index fd2a643..e431665 100644 --- a/diaspy/models.py +++ b/diaspy/models.py @@ -7,7 +7,15 @@ MUST NOT import anything. import json -import re +import copy + +BS4_SUPPORT=False +try: + from bs4 import BeautifulSoup +except ImportError: + import re + print("BeautifulSoup not found, falling back on regex.") +else: BS4_SUPPORT=True from diaspy import errors @@ -130,10 +138,10 @@ class Aspect(): class Notification(): """This class represents single notification. 
""" - _who_regexp = re.compile(r'/people/([0-9a-f]+)["\']{1} class=["\']{1}hovercardable') - _when_regexp = re.compile(r'[0-9]{4,4}(-[0-9]{2,2}){2,2} [0-9]{2,2}(:[0-9]{2,2}){2,2} UTC') - _aboutid_regexp = re.compile(r'/posts/[0-9a-f]+') - _htmltag_regexp = re.compile('') + if not BS4_SUPPORT: + _who_regexp = re.compile(r'/people/([0-9a-f]+)["\']{1} class=["\']{1}hovercardable') + _aboutid_regexp = re.compile(r'/posts/[0-9a-f]+') + _htmltag_regexp = re.compile('') def __init__(self, connection, data): self._connection = connection @@ -150,10 +158,17 @@ class Notification(): def __str__(self): """Returns notification note. """ - string = re.sub(self._htmltag_regexp, '', self._data['note_html']) - string = string.strip().split('\n')[0] - while ' ' in string: string = string.replace(' ', ' ') - return string + if BS4_SUPPORT: + soup = BeautifulSoup(self._data['note_html'], 'lxml') + media_body = soup.find('div', {"class": "media-body"}) + div = media_body.find('div') + if div: div.decompose() + return media_body.getText().strip() + else: + string = re.sub(self._htmltag_regexp, '', self._data['note_html']) + string = string.strip().split('\n')[0] + while ' ' in string: string = string.replace(' ', ' ') + return string def __repr__(self): """Returns notification note with more details. @@ -164,15 +179,26 @@ class Notification(): """Returns id of post about which the notification is informing OR: If the id is None it means that it's about user so .who() is called. 
""" - about = self._aboutid_regexp.search(self._data['note_html']) - if about is None: about = self.who() - else: about = int(about.group(0)[7:]) - return about + if BS4_SUPPORT: + soup = BeautifulSoup(self._data['note_html'], 'lxml') + id = soup.find('a', {"data-ref": True}) + if id: return id['data-ref'] + else: return self.who()[0] + else: + about = self._aboutid_regexp.search(self._data['note_html']) + if about is None: about = self.who()[0] + else: about = int(about.group(0)[7:]) + return about def who(self): """Returns list of guids of the users who caused you to get the notification. """ - return [who for who in self._who_regexp.findall(self._data['note_html'])] + if BS4_SUPPORT: # Parse the HTML with BS4 + soup = BeautifulSoup(self._data['note_html'], 'lxml') + hovercardable_soup = soup.findAll('a', {"class": "hovercardable"}) + return list(set([soup['href'][8:] for soup in hovercardable_soup])) + else: + return list(set([who for who in self._who_regexp.findall(self._data['note_html'])])) def when(self): """Returns UTC time as found in note_html. @@ -199,6 +225,14 @@ class Conversation(): .. note:: Remember that you need to have access to the conversation. """ + if not BS4_SUPPORT: + _message_stream_regexp = re.compile(r'
(.*?)
', re.DOTALL) + _message_guid_regexp = re.compile(r'data-guid=["\']{1}([0-9]+)["\']{1}') + _message_created_at_regexp = re.compile(r'
', re.DOTALL) + _message_author_guid_regexp = re.compile(r'