From b8a80a18fd0362e5f7367db568b1a0358212d135 Mon Sep 17 00:00:00 2001 From: Andrew Engelbrecht Date: Thu, 21 Jan 2016 10:11:27 -0500 Subject: [PATCH] performance improvement for large emails very large emails around 4 MB were slowing down edward because it was using a complex regex. parsing a 4 MB email was taking 4 days. this fix removes group matching, causing 4 MB files to be parsed in a matter of seconds. --- edward | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/edward b/edward index 49bdb65..b13ac7a 100755 --- a/edward +++ b/edward @@ -406,8 +406,7 @@ def scan_and_split (payload_piece, match_name, pattern): return [payload_piece] flags = re.DOTALL | re.MULTILINE - matches = re.search("(?P<beginning>.*?)(?P<match>" + pattern + - ")(?P<rest>.*)", payload_piece.string, flags=flags) + matches = re.search(pattern, payload_piece.string, flags=flags) if matches == None: pieces = [payload_piece] @@ -415,15 +414,15 @@ def scan_and_split (payload_piece, match_name, pattern): else: beginning = PayloadPiece() - beginning.string = matches.group('beginning') + beginning.string = payload_piece.string[:matches.start()] beginning.piece_type = payload_piece.piece_type match = PayloadPiece() - match.string = matches.group('match') + match.string = payload_piece.string[matches.start():matches.end()] match.piece_type = match_name rest = PayloadPiece() - rest.string = matches.group('rest') + rest.string = payload_piece.string[matches.end():] rest.piece_type = payload_piece.piece_type more_pieces = scan_and_split(rest, match_name, pattern) -- 2.25.1