Rip out BeautifulSoup, and use a custom XML-ish prettyprinter.

2026-04-26 05:14:57 +08:00 · 2011-02-06 14:17:30 +13:00
parent 44dc3a052e
commit 7156d1a73a
5 changed files with 98 additions and 2036 deletions
--- a/libmproxy/utils.py
+++ b/libmproxy/utils.py
@@ -12,9 +12,7 @@
 # 
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-import re, os, subprocess, datetime
-from contrib import BeautifulSoup
+import re, os, subprocess, datetime, textwrap


 def format_timestamp(s):
@@ -48,14 +46,48 @@ def cleanBin(s):
    return "".join(parts)
    

-def prettybody(s):
+TAG = r"""
+        <\s*
+        (?!\s*[!"])                     # skip <! ...> constructs (and a stray <")
+        (?P<close>\s*\/)?               # leading slash: this is a closing tag
+        (?P<name>\w+)
+        (                               # attribute tokens, one per iteration
+                [a-zA-Z0-9_#:=().%\/]   # one char per pass: a nested + inside
+            |                           # ( )* backtracks exponentially
+                "[^\"]*"['\"]*
+            |
+                '[^']*'['\"]*
+            | 
+                \s
+        )*
+        (?P<selfcont>\s*\/\s*)?         # NOTE: the token class above also eats /
+        \s*>
+      """
+UNI = set(["br", "hr", "img", "input", "area", "link"])  # never open a level
+INDENT = " "*4  # whitespace added per nesting level
+def pretty_xmlish(s):
    """
-        Return a list of pretty-printed lines.
+        Robust pretty-printer for XML-ish data: each tag gets its own line,
+        indented by nesting depth. Returns a list of lines.
    """
-    s = BeautifulSoup.BeautifulStoneSoup(s)
-    s = s.prettify().strip()
-    parts = s.split("\n")
-    return [repr(i)[1:-1] for i in parts]
+    data, offset, indent, prev = [], 0, 0, None
+    for i in re.finditer(TAG, s, re.VERBOSE|re.MULTILINE):
+        name, tag = i.group("name"), i.group().strip()
+        # Text between tags: keep its non-blank lines at the current depth.
+        for x in textwrap.dedent(s[offset:i.start()]).split("\n"):
+            if x.strip():
+                data.append(indent*INDENT + x)
+        # A closing tag dedents, unless it closes a UNI tag that came just before.
+        if i.group("close") and not (name in UNI and name==prev):
+            indent = max(indent - 1, 0)
+        data.append(indent*INDENT + tag)
+        offset = i.end()
+        # TAG's token loop eats the trailing "/", so test the tag text too.
+        selfcont = i.group("selfcont") or tag[:-1].rstrip().endswith("/")
+        if not (i.group("close") or selfcont or name in UNI):
+            indent += 1
+        prev = name
+    return data


 def hexdump(s):