Har Mongo dump script

Here is my script, based on the HAR dump example, which writes requests and responses to MongoDB. It requires the pymongo module.
I hope it helps someone.

import base64
import sys

from datetime import datetime, timezone
from mitmproxy import ctx
from mitmproxy.net.http import cookies
from mitmproxy.utils import strutils
from pymongo import MongoClient, WriteConcern
from pymongo.errors import ConnectionFailure
from traceback import print_exc


# Set of server connection ids seen so far. get_response_har_entry() consults
# it so that 'connect'/'ssl' timings are reported only for the first response
# on a connection, not for entries that reuse an existing connection.
# NOTE: ids are only ever added, never removed.
SERVERS_SEEN = set()

# Target database and the collections that receive request/response entries.
MONGODB_DB = "mitmproxy"
MONGODB_COLLECTION_REQUEST = "request"
MONGODB_COLLECTION_RESPONSE = "response"


class HarMongoDump(object):

    def __init__(self):
        self._SERVER = "localhost"
        self._PORT = 27017
        self._TIMEOUT = 5000
        self._MAX_POOL_SIZE = 200

        self.DB_CLIENT = None

    def configure(self, updated):
        """
            (Re)create the MongoDB client and check that the server answers.

            ``updated`` is part of the addon hook signature and is not used.
            On success ``self.DB_CLIENT`` holds the new client. On any failure it
            is left as None, which makes request() and response() skip their work.
        """
        # This hook can run more than once (it fires on option changes), so
        # release the previous client first instead of leaking its pool.
        if self.DB_CLIENT is not None:
            self.DB_CLIENT.close()
            self.DB_CLIENT = None

        client = None
        try:
            client = MongoClient(
                self._SERVER,
                self._PORT,
                connect=False,
                serverSelectionTimeoutMS=self._TIMEOUT,
                maxPoolSize=self._MAX_POOL_SIZE
            )
            # connect=False defers the connection, so issue a cheap command to
            # find out now whether the server is reachable.
            client.admin.command("ping")
        except Exception:
            ctx.log.error("mongo_dump configure DB connection problem. (server=[{0}], port=[{1}], timeout=[{2}], max_pool_size=[{3}])".format(self._SERVER, self._PORT, self._TIMEOUT, self._MAX_POOL_SIZE))
            # Report the actual cause the same way the other hooks do.
            print_exc(file=sys.stderr)
            # Do not leave a half-initialised client behind.
            if client is not None:
                client.close()
            return

        self.DB_CLIENT = client
        ctx.log.info("mongo_dump configure MongoDB client initialized")

    def request(self, flow):
        """
            Addon ``request`` hook: store a HAR-style entry for the request of
            ``flow`` in the MONGODB_COLLECTION_REQUEST collection.

            Does nothing (apart from logging) when no DB client is available.
            Failures while building or inserting the entry are logged and the
            traceback is printed to stderr; they are never raised to the caller.
        """
        if self.DB_CLIENT is None:
            ctx.log.error("mongo_dump request DB client None!")
            return

        # Bound before the try so the outer handler can refer to it even when
        # building the entry is the step that failed.
        entry = None
        try:
            entry = self.get_request_har_entry(flow)

            db = self.DB_CLIENT.get_database(MONGODB_DB)
            collection = db[MONGODB_COLLECTION_REQUEST]
            try:
                insert_result = collection.insert_one(entry)
            except Exception:
                ctx.log.error("mongo_dump request DB insert error!")
                print_exc(file=sys.stderr)
                ctx.log.error(entry)
            else:
                ctx.log.debug("mongo_dump request request_id={}".format(insert_result.inserted_id))
        except Exception:
            ctx.log.error("mongo_dump request Exception.")
            print_exc(file=sys.stderr)
            # There is only an entry to show if it was built successfully.
            if entry is not None:
                ctx.log.error(entry)

    def response(self, flow):
        """
            Addon ``response`` hook: store a HAR-style entry for the response of
            ``flow`` in the MONGODB_COLLECTION_RESPONSE collection.

            Does nothing (apart from logging) when no DB client is available.
            Failures while building or inserting the entry are logged and the
            traceback is printed to stderr; they are never raised to the caller.
        """
        if self.DB_CLIENT is None:
            ctx.log.error("mongo_dump response DB client None!")
            return

        # Bound before the try so the outer handler can refer to it even when
        # building the entry is the step that failed.
        entry = None
        try:
            entry = self.get_response_har_entry(flow)

            db = self.DB_CLIENT.get_database(MONGODB_DB)
            collection = db[MONGODB_COLLECTION_RESPONSE]
            try:
                insert_result = collection.insert_one(entry)
            except Exception:
                ctx.log.error("mongo_dump response DB insert error!")
                print_exc(file=sys.stderr)
                ctx.log.error(entry)
            else:
                ctx.log.debug("mongo_dump response response_id={}".format(insert_result.inserted_id))
        except Exception:
            ctx.log.error("mongo_dump response Exception.")
            print_exc(file=sys.stderr)
            # There is only an entry to show if it was built successfully.
            if entry is not None:
                ctx.log.error(entry)

    def done(self):
        if self.DB_CLIENT is not None:
            self.DB_CLIENT.close()
            ctx.log.info("mongo_dump done MongoDB client closed.")


    def get_request_har_entry(self, flow):
        """
            Build the HAR-style document describing the request side of ``flow``.

            Besides the HAR request fields, the entry carries some extras:
            pretty_url, pretty_host, path, client certificate details and the
            client/server addresses. Timestamps are stored as timezone-aware UTC
            datetime objects.

            Returns the entry as a plain dict.
        """
        request = flow.request

        # The body may be None (mitmproxy documents this for missing content);
        # report that as size 0 instead of failing on len(None).
        body = request.content
        body_size = len(body) if body is not None else 0

        entry = {
            "startedDateTime": datetime.fromtimestamp(request.timestamp_start, timezone.utc),
            "method": request.method,
            "url": request.url,
            "pretty_url": request.pretty_url,  # not part of HAR
            "pretty_host": request.pretty_host,  # not part of HAR
            "path": request.path,  # not part of HAR
            "httpVersion": request.http_version,
            "cookies": self._format_request_cookies(request.cookies.fields),
            "headers": self._name_value(request.headers),
            "queryString": self._name_value(request.query or {}),
            "headersSize": len(str(request.headers)),
            "bodySize": body_size
        }

        if request.method in ["POST", "PUT", "PATCH"]:
            params = [
                {"name": a, "value": b}
                for a, b in request.urlencoded_form.items(multi=True)
            ]
            entry["postData"] = {
                "mimeType": request.headers.get("Content-Type", ""),
                "text": request.get_text(strict=False),
                "params": params
            }

        client_cert = flow.client_conn.clientcert
        if client_cert:
            entry["clientCert"] = {
                "issuer": client_cert.issuer,
                "notbefore": client_cert.notbefore,
                "notafter": client_cert.notafter,
                "subject": client_cert.subject,
                # Certificate serials are frequently wider than 64 bits, which
                # a BSON integer cannot hold (the insert would fail), so the
                # serial is stored as its decimal string.
                "serial": str(client_cert.serial),
                "cn": client_cert.cn
            }

        entry["clientIPAddress"] = flow.client_conn.address[0]

        if flow.server_conn.connected():
            server_address = flow.server_conn.ip_address
            entry["serverIPAddress"] = str(server_address[0])
            entry["serverPortAddress"] = str(server_address[1])

        return entry

    def get_response_har_entry(self, flow):
        entry = {
            "startedDateTime": datetime.fromtimestamp(flow.request.timestamp_start, timezone.utc),
            "status": flow.response.status_code,
            "statusText": flow.response.reason,
            "httpVersion": flow.response.http_version,
            "cookies": self._format_response_cookies(flow.response.cookies.fields),
            "headers": self._name_value(flow.response.headers),
            "content": {},
            "redirectURL": flow.response.headers.get('Location', ''),
            "headersSize": len(str(flow.response.headers)),
            "bodySize": "",
#            "cache": {},
        }

        response_body_size = len(flow.response.raw_content)
        response_body_decoded_size = len(flow.response.content)
        response_body_compression = response_body_decoded_size - response_body_size

        entry["content"]["size"] = response_body_size
        entry["content"]["compression"] = response_body_compression,
        entry["content"]["mimeType"] = flow.response.headers.get('Content-Type', '')

        if strutils.is_mostly_bin(flow.response.content):
            entry["content"]["text"] = base64.b64encode(flow.response.content).decode() # Store binary data as base64
            entry["content"]["encoding"] = "base64"
        else:
            entry["content"]["text"] = flow.response.get_text(strict=False)

        entry["bodySize"] = response_body_size

        # -1 indicates that these values do not apply to current request
        connect_time = -1
        ssl_time = -1

        if flow.server_conn.timestamp_start is not None:
            if flow.server_conn and flow.server_conn.id not in SERVERS_SEEN:
                connect_time = (flow.server_conn.timestamp_tcp_setup -
                                flow.server_conn.timestamp_start)

                if flow.server_conn.timestamp_tls_setup is not None:
                    ssl_time = (flow.server_conn.timestamp_tls_setup -
                                flow.server_conn.timestamp_tcp_setup)

                SERVERS_SEEN.add(flow.server_conn.id)

        # Calculate raw timings from timestamps. DNS timings can not be calculated
        # for lack of a way to measure it. The same goes for HAR blocked.
        # mitmproxy will open a server connection as soon as it receives the host
        # and port from the client connection. So, the time spent waiting is actually
        # spent waiting between request.timestamp_end and response.timestamp_start
        # thus it correlates to HAR wait instead.
        timings_raw = {
            'send': flow.request.timestamp_end - flow.request.timestamp_start,
            'connect': connect_time,
            'ssl': ssl_time,
        }

        if flow.response.timestamp_start is not None:
            timings_raw['receive'] = flow.response.timestamp_end - flow.response.timestamp_start
            timings_raw['wait'] = flow.response.timestamp_start - flow.request.timestamp_end

        # HAR timings are integers in ms, so we re-encode the raw timings to that format.
        timings = dict([(k, int(1000 * v)) for k, v in timings_raw.items()])

        # full_time is the sum of all timings.
        # Timings set to -1 will be ignored as per spec.
        full_time = sum(v for v in timings.values() if v > -1)

        entry["timings"] = timings
        entry["time"] = full_time

        if flow.server_conn.connected():
            entry["serverIPAddress"] = str(flow.server_conn.ip_address[0])
            entry["serverPortAddress"] = str(flow.server_conn.ip_address[1])

        return entry

    def _format_cookies(self, cookie_list):
        rv = []

        for name, value, attrs in cookie_list:
            cookie_har = {
                "name": name,
                "value": value,
            }

            # HAR only needs some attributes
            for key in ["path", "domain", "comment"]:
                if key in attrs:
                    cookie_har[key] = attrs[key]

            # These keys need to be boolean!
            for key in ["httpOnly", "secure"]:
                cookie_har[key] = bool(key in attrs)

            # Expiration time needs to be formatted
            expire_ts = cookies.get_expiration_ts(attrs)
            if expire_ts is not None:
                cookie_har["expires"] = datetime.fromtimestamp(expire_ts, timezone.utc)

            rv.append(cookie_har)

        return rv

    def _format_request_cookies(self, fields):
        """
            Format request cookie fields as HAR cookie records.

            The fields are passed through cookies.group_cookies() and the result
            is handed to _format_cookies().
        """
        grouped = cookies.group_cookies(fields)
        return self._format_cookies(grouped)

    def _format_response_cookies(self, fields):
        return self._format_cookies((c[0], c[1][0], c[1][1]) for c in fields)

    def _name_value(self, obj):
        """
            Convert (key, value) pairs to HAR format.
        """
        return [{"name": k, "value": v} for k, v in obj.items()]

# Module-level addon list; mitmproxy loads the addon instances listed here.
addons = [HarMongoDump()]
3 Likes