Make PuSHing the Pubhubsubbub server an async task (#436, #585)
authorSebastian Spaeth <Sebastian@SSpaeth.de>
Tue, 15 Jan 2013 13:41:30 +0000 (14:41 +0100)
committerSebastian Spaeth <Sebastian@SSpaeth.de>
Tue, 15 Jan 2013 13:53:08 +0000 (14:53 +0100)
Notifying the PuSH servers had 3 problems. 

1) it was done immediately after sending the processing task to celery. So if celery was run in a separate
process, we would notify the PuSH servers before the new media was
processed/visible. (#436)

2) Notification code was called in submit/views.py, so submitting via the
   API never resulted in notifications. (#585)

3) If notifying the PuSH server failed, we would never retry.

The solution was to make the PuSH notification an asynchronous subtask. This
way: 1) it will only be called once async processing has finished, 2) it
is in the main processing code path, so even API calls will result in
notifications, and 3) We retry 3 times in case of failure before giving up.
If the server is in a separate process, we will wait 3x 2 minutes before
retrying the notification.

The only downside is that the celery server needs to have access to the internet
to ping the PuSH server. If that is a problem, we need to make the task belong
to a special group of celery servers that has access to the internet.

As a side effect, I believe I removed the limitation that prevented us from
upgrading celery.

Signed-off-by: Sebastian Spaeth <Sebastian@SSpaeth.de>
mediagoblin/plugins/api/views.py
mediagoblin/processing/task.py
mediagoblin/submit/lib.py
mediagoblin/submit/views.py

index 6aa4ef9fd91120b043f7f9dde0b9ec4a61cf500e..7383e20d039de9138737907ded5fd4a0251326c3 100644 (file)
@@ -86,7 +86,7 @@ def post_entry(request):
     #
     # (... don't change entry after this point to avoid race
     # conditions with changes to the document via processing code)
-    run_process_media(entry)
+    run_process_media(entry, request)
 
     return json_response(get_entry_serializable(entry, request.urlgen))
 
index b29de9bd89883656558c7384b83983e67f33256c..2cdd5f1a255de23d0aa97ce94472fdf65af94a4b 100644 (file)
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 import logging
+import urllib
+import urllib2
 
-from celery.task import Task
+from celery import registry, task
 
 from mediagoblin import mg_globals as mgg
 from mediagoblin.db.models import MediaEntry
@@ -28,18 +30,51 @@ logging.basicConfig()
 _log.setLevel(logging.DEBUG)
 
 
+@task.task(default_retry_delay=2 * 60)
+def handle_push_urls(feed_url):
+    """Subtask, notifying the PuSH servers of new content
+
+    Retry 3 times every 2 minutes if run in separate process before failing."""
+    if not mgg.app_config["push_urls"]:
+        return # Nothing to do
+    _log.debug('Notifying Push servers for feed {0}'.format(feed_url))
+    hubparameters = {
+        'hub.mode': 'publish',
+        'hub.url': feed_url}
+    hubdata = urllib.urlencode(hubparameters)
+    hubheaders = {
+        "Content-type": "application/x-www-form-urlencoded",
+        "Connection": "close"}
+    for huburl in mgg.app_config["push_urls"]:
+        hubrequest = urllib2.Request(huburl, hubdata, hubheaders)
+        try:
+            hubresponse = urllib2.urlopen(hubrequest)
+        except (urllib2.HTTPError, urllib2.URLError) as exc:
+            # We retry by default 3 times before failing
+            _log.info("PuSH url %r gave error %r", huburl, exc)
+            try:
+                return handle_push_urls.retry(exc=exc, throw=False)
+            except Exception as e:
+                # All retries failed, Failure is no tragedy here, probably.
+                _log.warn('Failed to notify PuSH server for feed {0}. '
+                          'Giving up.'.format(feed_url))
+                return False
+
 ################################
 # Media processing initial steps
 ################################
 
-class ProcessMedia(Task):
+class ProcessMedia(task.Task):
     """
     Pass this entry off for processing.
     """
-    def run(self, media_id):
+    def run(self, media_id, feed_url):
         """
         Pass the media entry off to the appropriate processing function
         (for now just process_image...)
+
+        :param feed_url: The feed URL that the PuSH server needs to be
+            updated for.
         """
         entry = MediaEntry.query.get(media_id)
 
@@ -58,6 +93,9 @@ class ProcessMedia(Task):
             entry.state = u'processed'
             entry.save()
 
+            # Notify the PuSH servers as async task
+            handle_push_urls.subtask().delay(feed_url)
+
             json_processing_callback(entry)
         except BaseProcessingFail as exc:
             mark_entry_failed(entry.id, exc)
@@ -97,3 +135,7 @@ class ProcessMedia(Task):
 
         entry = mgg.database.MediaEntry.query.filter_by(id=entry_id).first()
         json_processing_callback(entry)
+
+# Register the task
+process_media = registry.tasks[ProcessMedia.name]
+
index db5dfe534024422ee78b94a3ff3b7cf031b4c100..ba07c6fa994954e624ec1a4566baf04a57ec4339 100644 (file)
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
-import urllib
-import urllib2
 import logging
 import uuid
-from celery import registry
 from werkzeug.utils import secure_filename
 
-from mediagoblin import mg_globals
 from mediagoblin.processing import mark_entry_failed
-from mediagoblin.processing.task import ProcessMedia
+from mediagoblin.processing.task import process_media
 
 
 _log = logging.getLogger(__name__)
@@ -58,11 +54,13 @@ def prepare_queue_task(app, entry, filename):
     return queue_file
 
 
-def run_process_media(entry):
-    process_media = registry.tasks[ProcessMedia.name]
+def run_process_media(entry, request):
+    feed_url = request.urlgen(
+        'mediagoblin.user_pages.atom_feed',
+        qualified=True, user=request.user.username)
     try:
         process_media.apply_async(
-            [unicode(entry.id)], {},
+            [entry.id, feed_url], {},
             task_id=entry.queued_task_id)
     except BaseException as exc:
         # The purpose of this section is because when running in "lazy"
@@ -76,30 +74,3 @@ def run_process_media(entry):
         mark_entry_failed(entry.id, exc)
         # re-raise the exception
         raise
-
-
-def handle_push_urls(request):
-    if mg_globals.app_config["push_urls"]:
-        feed_url = request.urlgen(
-                           'mediagoblin.user_pages.atom_feed',
-                           qualified=True,
-                           user=request.user.username)
-        hubparameters = {
-            'hub.mode': 'publish',
-            'hub.url': feed_url}
-        hubdata = urllib.urlencode(hubparameters)
-        hubheaders = {
-            "Content-type": "application/x-www-form-urlencoded",
-            "Connection": "close"}
-        for huburl in mg_globals.app_config["push_urls"]:
-            hubrequest = urllib2.Request(huburl, hubdata, hubheaders)
-            try:
-                hubresponse = urllib2.urlopen(hubrequest)
-            except urllib2.HTTPError as exc:
-                # This is not a big issue, the item will be fetched
-                # by the PuSH server next time we hit it
-                _log.warning(
-                    "push url %r gave error %r", huburl, exc.code)
-            except urllib2.URLError as exc:
-                _log.warning(
-                    "push url %r is unreachable %r", huburl, exc.reason)
index 2d609b31e2528a729c33a6823b23105c3dd7d603..145b9f5e0b7b122d75258037e78dc14d23ec694d 100644 (file)
@@ -32,8 +32,7 @@ from mediagoblin.submit import forms as submit_forms
 from mediagoblin.messages import add_message, SUCCESS
 from mediagoblin.media_types import sniff_media, \
     InvalidFileType, FileTypeNotSupported
-from mediagoblin.submit.lib import handle_push_urls, run_process_media, \
-    prepare_queue_task
+from mediagoblin.submit.lib import run_process_media, prepare_queue_task
 
 
 @require_active_login
@@ -90,10 +89,7 @@ def submit_start(request):
                 #
                 # (... don't change entry after this point to avoid race
                 # conditions with changes to the document via processing code)
-                run_process_media(entry)
-
-                handle_push_urls(request)
-
+                run_process_media(entry, request)
                 add_message(request, SUCCESS, _('Woohoo! Submitted!'))
 
                 return redirect(request, "mediagoblin.user_pages.user_home",