#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Quick and Dirty Tab Groups Dumper for Firefox
--snip--

Copyright (C) 2014 Stephan Sokolow

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""

# Module metadata; `__version__` is interpolated into the `--version` output
# built in the command-line entry point at the bottom of this file.
__appname__ = "Quick And Dirty Tab Groups Dumper for Firefox"
__author__ = "Stephan Sokolow (deitarion/SSokolow)"
__version__ = "0.2"
__license__ = "MIT"

import logging
log = logging.getLogger(__name__)

import json, os, re

#{{{ Data Types for Schema
# Matches runs of characters which should not appear in an HTML anchor name
bad_anchor_char_re = re.compile('[^A-Za-z0-9-_:.]+')
# Matches the start (or, for about: pages, the whole) of the URL forms
# which the schema accepts
url_re = re.compile("""
                    ^data:image/png;base64,|
                    ^(chrome|file|https?)://|
                    ^about:(blank|home|newtab)$|
                    ^javascript:
                    """, re.VERBOSE)

# NOTE: The predicates below return truthy/falsy values rather than strict
#       booleans (e.g. the string itself or a regex match object).
# Text decoded by the `json` module is `str` on Python 3; the Python 2
# `unicode` builtin no longer exists and raised NameError here.
is_nonempty_string = lambda x: isinstance(x, str) and x

is_url = lambda x: isinstance(x, str) and url_re.match(x)
is_nullable_url = lambda x: x is None or is_url(x)

is_natural_int = lambda x: isinstance(x, int) and x >= 0
is_positive_int = lambda x: is_natural_int(x) and x > 0

is_bool_string = lambda x: isinstance(x, str) and x in ('true', 'false')
def is_int_string(x):
    """Return true if the input is a non-empty string which `int()` accepts"""
    if is_nonempty_string(x):
        try:
            int(x)
        except ValueError:
            pass  # Not parseable as an integer; fall through to False
        else:
            return True
    return False
# Accepts either an integer-bearing string or a non-negative integer
is_int_ish = lambda value: is_int_string(value) or is_natural_int(value)

def is_coord_pair(x):
    """Return true if the input is a string of the form `<int>,<int>`

    Anything which is not a non-empty string yields a falsy result.
    """
    nonempty = is_nonempty_string(x)
    if not nonempty:
        return nonempty

    parts = x.split(',')
    if len(parts) != 2:
        return False
    return all(is_int_string(part) for part in parts)

#}}}
#{{{ Schema

def apply_subschema(prefix, subschema):
    """Return a copy of `subschema` with `prefix` prepended to every path

    The input dict is left untouched; validators are shared, not copied.
    """
    prefixed = {}
    for path, validator in subschema.items():
        prefixed[prefix + path] = validator
    return prefixed

def _is_bool(x):
    """Return true if the input is a boolean"""
    return isinstance(x, bool)

# Validators for one item of a tab's `entries` list, keyed by path relative
# to that item. Grafted under concrete prefixes via `apply_subschema`.
TAB_ENTRY_SCHEMA = {
    # Non-negative integer identifiers
    'ID/': is_natural_int,
    'docIdentifier/': is_natural_int,
    'docshellID/': is_natural_int,

    # URLs
    'url/': is_url,
    'referrer/': is_url,

    # Non-empty strings
    'title/': is_nonempty_string,
    'owner_b64/': is_nonempty_string,
    'scroll/': is_nonempty_string,

    # Flags
    'subframe/': _is_bool,
}

# Structural validators shared by the entries below. The "nonempty" variants
# return the container itself (truthy only when it has content).
def _is_bool(x):
    """Return true if the input is a boolean"""
    return isinstance(x, bool)

def _is_dict(x):
    """Return true if the input is a dict (possibly empty)"""
    return isinstance(x, dict)

def _is_list(x):
    """Return true if the input is a list (possibly empty)"""
    return isinstance(x, list)

def _is_nonempty_dict(x):
    """Return a truthy value if the input is a dict with content"""
    return isinstance(x, dict) and x

def _is_nonempty_list(x):
    """Return a truthy value if the input is a list with content"""
    return isinstance(x, list) and x

# Validators for a `tabs` list, keyed by path relative to its parent.
TABS_SCHEMA = {
    'tabs/': _is_nonempty_list,
    'tabs/list/': _is_nonempty_dict,
    'tabs/list/attributes/': _is_dict,
    'tabs/list/attributes/image/': is_url,
    'tabs/list/entries/': _is_nonempty_list,
    'tabs/list/entries/list/': _is_nonempty_dict,
    'tabs/list/entries/list/ID/': is_natural_int,
    'tabs/list/entries/list/children/': _is_list,
    'tabs/list/entries/list/children/list/': _is_dict,
    'tabs/list/entries/list/url/': is_url,
    'tabs/list/extData/': _is_nonempty_dict,
    'tabs/list/extData/tabview-tab/': is_nonempty_string,
    'tabs/list/extData/weaveLastUsed/': is_nonempty_string,
    'tabs/list/hidden/': _is_bool,
    'tabs/list/image/': is_nullable_url,
    'tabs/list/index/': is_positive_int,
    'tabs/list/lastAccessed/': is_natural_int,
    'tabs/list/pinned/': _is_bool,
    'tabs/list/scroll/': _is_dict,
    'tabs/list/scroll/scroll/': is_nonempty_string,
}
# Graft the per-entry schema onto entries and two levels of nested children
for _entry_prefix in ('tabs/list/entries/list/',
                      'tabs/list/entries/list/children/list/',
                      'tabs/list/entries/list/children/list/children/list/'):
    TABS_SCHEMA.update(apply_subschema(_entry_prefix, TAB_ENTRY_SCHEMA))
del _entry_prefix

# Validators for a `_closedTabs` list, keyed by path relative to its parent.
# String checks use `str`: the Python 2 `unicode` builtin does not exist on
# Python 3 and raised NameError as soon as one of these validators ran.
CLOSED_TABS_SCHEMA = {
    '_closedTabs/': lambda x: isinstance(x, list),
    '_closedTabs/list/': lambda x: isinstance(x, dict),
    '_closedTabs/list/closedAt/': is_natural_int,
    '_closedTabs/list/image/': is_url,
    '_closedTabs/list/pos/': is_positive_int,
    '_closedTabs/list/state/': lambda x: isinstance(x, dict),
    '_closedTabs/list/state/attributes/': lambda x: isinstance(x, dict),
    '_closedTabs/list/state/entries/': lambda x: isinstance(x, list),
    '_closedTabs/list/state/entries/list/': lambda x: isinstance(x, dict),
    '_closedTabs/list/state/extData/': lambda x: isinstance(x, dict),
    '_closedTabs/list/state/extData/tabview-tab/':
        lambda x: isinstance(x, str),
    '_closedTabs/list/state/hidden/': lambda x: isinstance(x, bool),
    '_closedTabs/list/state/image/': is_url,
    '_closedTabs/list/state/index/': is_positive_int,
    '_closedTabs/list/state/lastAccessed/': is_natural_int,
    '_closedTabs/list/state/scroll/': lambda x: isinstance(x, dict),
    '_closedTabs/list/state/scroll/scroll/': is_coord_pair,
    '_closedTabs/list/title/': lambda x: isinstance(x, str),
}
# Each closed tab's saved state carries the same per-entry records as a
# live tab
CLOSED_TABS_SCHEMA.update(apply_subschema(
    '_closedTabs/list/state/entries/list/', TAB_ENTRY_SCHEMA))

# Validators for the per-window fields, keyed by path relative to a window
# dict. `sizemode/` checks against `str`: the Python 2 `unicode` builtin
# does not exist on Python 3 and raised NameError when the validator ran.
WINDOW_SCHEMA = {
    'extData/': lambda x: isinstance(x, dict),
    'extData/tabview-group/': is_nonempty_string,
    'extData/tabview-groups/': is_nonempty_string,
    'extData/tabview-ui/': is_nonempty_string,
    'height/': is_int_ish,
    'screenX/': is_int_ish,
    'screenY/': is_int_ish,
    'selected/': is_positive_int,
    'sizemode/': lambda x: isinstance(x, str),
    'title/': is_nonempty_string,
    'width/': is_int_ish,
}

#TODO: Finish deduplicating this schema definition
# Complete schema: maps a slash-terminated path ('' is the root, 'list/'
# stands for "any list item") to a validator for the value at that path.
# String checks use `str`: the Python 2 `unicode` builtin does not exist on
# Python 3 and raised NameError as soon as one of these validators ran.
SCHEMA = {
    '': lambda x: isinstance(x, dict) and x,
    '_closedWindows/': lambda x: isinstance(x, list),
    '_closedWindows/list/': lambda x: isinstance(x, dict),
    '_closedWindows/list/closedAt/': lambda x: isinstance(x, int),
    '_closedWindows/list/extData/__SessionManagerWindowId/':
        lambda x: isinstance(x, str),
    'global/': lambda x: isinstance(x, dict),
    'scratchpads/': lambda x: isinstance(x, list),
    'selectedWindow/': is_natural_int,
    'session/': lambda x: isinstance(x, dict),
    'session/lastUpdate/': is_natural_int,
    'session/recentCrashes/': is_natural_int,
    'session/startTime/': is_natural_int,
    'windows/': lambda x: isinstance(x, list) and x,
    'windows/list/': lambda x: isinstance(x, dict) and x,
    'windows/list/busy/': lambda x: isinstance(x, bool),
    'windows/list/extData/': lambda x: isinstance(x, dict) and x,
    'windows/list/extData/__SessionManagerWindowId/': is_nonempty_string,
    'windows/list/extData/tabview-last-session-group-name/':
        is_nonempty_string,
    'windows/list/extData/tabview-visibility/': is_bool_string,

    'windows/list/tabs/list/disallow/': is_nonempty_string,
    'windows/list/tabs/list/entries/list/cacheKey/': is_positive_int,
    'windows/list/tabs/list/entries/list/children/list/children/':
        lambda x: isinstance(x, list),
    'windows/list/tabs/list/entries/list/children/list/children/list/':
        lambda x: isinstance(x, dict),
    'windows/list/tabs/list/entries/list/structuredCloneState/':
        is_nonempty_string,
    'windows/list/tabs/list/entries/list/structuredCloneVersion/':
        is_positive_int,
    'windows/list/tabs/list/extData/': lambda x: isinstance(x, dict),
    'windows/list/tabs/list/extData/tabview-tab/': is_nonempty_string,
    'windows/list/tabs/list/extData/weaveLastUsed/': is_nonempty_string,
    'windows/list/tabs/list/lastAccessed/': is_natural_int,
    'windows/list/tabs/list/pageStyle/':
        lambda x: isinstance(x, (str, dict)),
    'windows/list/tabs/list/pageStyle/pageStyle/': is_nonempty_string,
    'windows/list/tabs/list/userTypedClear/': is_natural_int,
    'windows/list/tabs/list/userTypedValue/': is_nonempty_string,
}
# Open and closed windows share the same sub-structures. Later updates
# overwrite earlier entries for the same path.
SCHEMA.update(apply_subschema('_closedWindows/list/', CLOSED_TABS_SCHEMA))
SCHEMA.update(apply_subschema('_closedWindows/list/', TABS_SCHEMA))
SCHEMA.update(apply_subschema('_closedWindows/list/', WINDOW_SCHEMA))
SCHEMA.update(apply_subschema('windows/list/', CLOSED_TABS_SCHEMA))
SCHEMA.update(apply_subschema('windows/list/', TABS_SCHEMA))
SCHEMA.update(apply_subschema('windows/list/', WINDOW_SCHEMA))

#}}}
#{{{ Schema-related routines

def make_schema_line(path, data):
    """Build a copy-pasteable draft schema entry for `path`

    The entry checks for the type of `data` and is padded to 100 columns,
    followed by a comment holding the first 80 characters of `repr(data)`.
    """
    type_name = type(data).__name__
    entry = "'{0}': lambda x: isinstance(x, {1}),".format(path, type_name)
    preview = repr(data)[:80]
    return '{0} # {1}'.format(entry.ljust(100), preview)

def check_schema(data, dom_path='', schema=None, make_schema=False):
    """Recursive exploration for JSON via schemas

    Walks `data` depth-first. Each value's path is built from dict keys,
    with the literal component `list/` standing for any list item, and
    every component terminated by `/` (the root path is the empty string).

    @param data: Decoded JSON value (dict, list, str, int, float or None).
    @param dom_path: Path of `data` within the document being walked.
    @param schema: Dict mapping paths to validator callables. A validator
        receives the value and returns something truthy to accept it.
    @param make_schema: If true, never fail on paths missing from `schema`
        and skip the validators; instead collect a draft schema line (see
        `make_schema_line`) for every path missing from `schema`.

    @return: Dict mapping paths to draft schema lines. Always empty after
        a successful check with `make_schema` false.
    @raise ValueError: If `make_schema` is false and a path is missing from
        `schema` or its validator rejects the value, or (in either mode) a
        leaf has a type other than None/int/float/str.
    """
    schema = schema or {}
    result = {}

    def fail(msg):
        """Unified failure message"""
        dump = result[dom_path] = make_schema_line(dom_path, data)
        if isinstance(data, (list, dict)):
            # Containers may be huge, so only report their type
            raise ValueError("%s: %s @ %s\n    %s" % (
                msg, type(data), dom_path, dump))
        else:
            raise ValueError("%s: %s(%s) @ %s\n    %s" % (
                msg, type(data), data, dom_path, dump))

    if dom_path not in schema:
        if make_schema:
            result[dom_path] = make_schema_line(dom_path, data)
        else:
            fail("Unexpected element")

    # `make_schema` short-circuits the lookup, which may be for a path
    # that is missing from `schema`
    if make_schema or schema[dom_path](data):
        if isinstance(data, list):
            for item in data:
                result.update(check_schema(item, dom_path + 'list/',
                                schema=schema, make_schema=make_schema))
        elif isinstance(data, dict):
            for key in data:
                result.update(check_schema(data[key],
                                '%s%s/' % (dom_path, key),
                                schema=schema, make_schema=make_schema))
        # Decoded JSON strings are `str` on Python 3; the Python 2 `unicode`
        # builtin no longer exists and raised NameError on every leaf here.
        elif not (data is None or isinstance(data, (int, float, str))):
            fail("Unexpected data type")
    else:
        fail("Element failed schema")

    return result

#}}}

def _collect_tab_metadata(tab):
    """Flatten one session-store tab record into a simple dict

    @param tab: Tab dict with an `entries` list and, optionally, `extData`,
        `index`, `image` and `pinned` keys.
    @return: Dict with the keys `index`, `group`, `title`, `url`, `favicon`
        and `pinned`. `group` is the `groupID` found in the JSON string
        stored under `extData` -> `tabview-tab`, or None if unavailable.
    """
    # The group membership is stored as a JSON document inside a string
    raw_group = tab.get('extData', {}).get('tabview-tab', {})
    group_info = json.loads(raw_group) if raw_group else raw_group
    group_id = group_info.get('groupID', None) if group_info else None

    entries = tab['entries']
    if len(entries) > 0:
        # NOTE(review): this takes the last entry; presumably the active
        # page could instead be selected via the tab's `index` -- confirm.
        content = entries[-1]
    else:
        # Tabs without any entries are reported as a blank new tab
        content = {
            'title' : 'New tab',
            'url'   : 'about:newtab',
        }

    url = content['url']
    return {
        'index': tab.get('index'),
        'group': group_id,
        'title': content.get('title', url),
        'url': url,
        'favicon': tab.get('image', None),
        'pinned': tab.get('pinned', False)
    }

def dump_tab_groups(data):
    """Extract the tabs of every window, bucketed by tab group

    @param data: Already-decoded session data with a `windows` list.
    @return: One list per window, each holding `(group_name, tabs)` pairs.
        `group_name` is None for group IDs without a known title and
        `tabs` holds the dicts produced by `_collect_tab_metadata`.
    """
    windows = []
    for window in data['windows']:
        # Group titles are stored as a JSON document inside a string
        raw_groups = window.get('extData', {}).get('tabview-group', '{}')
        titles_by_id = {}
        for group_id, group_info in json.loads(raw_groups).items():
            titles_by_id[int(group_id)] = group_info['title']

        # Bucket by group ID first and only then swap in the names, so
        # that groups sharing a name cannot be merged by accident
        tabs_by_group = {}
        for tab in window['tabs']:
            meta = _collect_tab_metadata(tab)
            tabs_by_group.setdefault(meta['group'], []).append(meta)

        # TODO: Sort case-insensitively
        windows.append([(titles_by_id.get(group_id, None), members)
                        for group_id, members in tabs_by_group.items()])
    return windows

def dump_to_html(dump, for_tiddlywiki=False):
    """Convert `dump_tab_groups` output to HTML

    Specifically, an outline represented in HTML via the XOXO microformat
    so that it's both human- and machine-readable:
        http://www.microformats.org/wiki/xoxo

    @param dump: List of windows, each a list of `(group_name, tabs)`
        pairs where every tab is a dict with at least the keys `title`,
        `url` and `favicon` (`pinned` is optional).
    @param for_tiddlywiki: If true, emit each group's tabs as a `<pre>`
        block of `* [[title|url]]` lines and prefix group headings with
        a `!!!` marker instead of producing nested lists with favicons.
    @return: The serialized document as returned by `etree.tostring`.

    @note: Requires lxml as well as the module-level names
        `PLACEHOLDER_FAVICON_URI`, `HTML_EXPORT_TITLE` and
        `HTML_EXPORT_STYLE`.

    @todo: Clean up this code and reuse anything possible to produce an XBEL
        output option:
            http://en.wikipedia.org/wiki/XBEL
    """
    from lxml import etree
    from lxml.builder import E

    def attr_class(*args):
        """workaround for `class` being a reserved word"""
        return {"class": ' '.join(args)}

    def h3_prefix():
        """Heading marker which is only wanted in TiddlyWiki mode"""
        if for_tiddlywiki:
            return E.span('!!!', attr_class('copy-only'))
        return ''

    noicon = PLACEHOLDER_FAVICON_URI

    tab_count, windows = 0, []
    # The table of contents and the anchors span the whole document, so
    # they must outlive the per-window loop (and exist for an empty dump)
    toc_entries = []       # (heading text, anchor) in document order
    used_anchors = set()
    for pos, window in enumerate(dump):
        e_groups = []

        for group, tabs in window:
            if not group and all(x.get('pinned') for x in tabs):
                grp_name = "<Pinned Tabs>"
            else:
                grp_name = group or '<Unnamed Group>'

            # Derive a unique anchor; only the anchor receives the numeric
            # suffix so that the heading text stays untouched
            base_anchor = bad_anchor_char_re.sub('_', grp_name).lower()
            anchor, idx = base_anchor, 0
            while anchor in used_anchors:
                idx += 1
                anchor = '%s%s' % (base_anchor, idx)
            used_anchors.add(anchor)
            toc_entries.append((grp_name, anchor))

            tab_count += len(tabs)
            if for_tiddlywiki:
                lines = []
                for x in tabs:
                    lines.extend(['* [[', E.b(x['title']), '|',
                                 E.a(x['url'], href=x['url']),
                                 ']]\n'])
                e_tabs = E.pre(*lines)
            else:
                e_tabs = E.ul(*[E.li(E.a(E.img(src=x['favicon'] or noicon,
                                         alt='', width='16', height='16'),
                    x['title'], href=x['url'])) for x in tabs])

            e_groups.append(E.li(
                E.h3(h3_prefix(), grp_name, id=anchor),
                e_tabs
            ))

        windows.append(E.li(E.h2("Window %d" % pos), *e_groups))

    e_toc = E.ul(*[E.li(E.a(name, href='#%s' % anchor))
                   for name, anchor in toc_entries])

    title_str = '%s (%s tabs total)' % (HTML_EXPORT_TITLE, tab_count)
    return etree.tostring(E.html(
        E.head(
            E.title(title_str),
            E.style(HTML_EXPORT_STYLE)),
        E.body(
            E.h1(title_str), e_toc,
            E.ul(attr_class('xoxo'), *windows))))


def dump_to_text(dump):
    """Render `dump_tab_groups` output as plain text

    Every window gets a `Window N` heading followed by its groups. Each
    tab is written as a tab-indented title line followed by its URL on a
    line of its own. The text ends with the total tab count.
    """
    pieces = []
    total = 0
    for pos, window in enumerate(dump):
        pieces.append("Window {0}\n".format(pos))

        for group, tabs in window:
            if group:
                heading = group
            elif all(tab.get('pinned') for tab in tabs):
                heading = "<Pinned Tabs>"
            else:
                heading = '<Unnamed Group>'
            pieces.append("{0}\n".format(heading))

            total += len(tabs)
            pieces.extend("\t{1}\n{0}\n".format(tab['url'], tab['title'])
                          for tab in tabs)
        pieces.append("\n")

    pieces.append('({0} tabs total)'.format(total))
    return ''.join(pieces)

# From https://gist.github.com/Tblue/62ff47bef7f894e92ed5
def decompress(file_obj):
    """Return the decompressed payload of a "mozLz40" file

    @param file_obj: Binary file object positioned at the start of the
        file. It is read to the end.
    @return: The decompressed bytes.
    @raise ValueError: If the file does not start with the 8-byte magic
        number `mozLz40\\0`.
    """
    # Validate the header first so that unsuitable files are rejected even
    # before the third-party dependency gets imported
    if file_obj.read(8) != b"mozLz40\0":
        # Raising a plain string is a TypeError on Python 3
        raise ValueError("Invalid magic number")

    import lz4.block
    return lz4.block.decompress(file_obj.read())


if __name__ == '__main__':
    # Command-line entry point: decode the given (or auto-discovered)
    # session files and either check them against the schema, draft a
    # schema from them, or dump their tab groups as text.
    from optparse import OptionParser
    parser = OptionParser(version="%%prog v%s" % __version__,
            usage="%prog [opts] [path]",
            description=__doc__.replace('\r\n', '\n').split('\n--snip--\n')[0])
    parser.add_option('-v', '--verbose', action="count", dest="verbose",
        default=2, help="Increase the verbosity. Use twice for extra effect")
    parser.add_option('-q', '--quiet', action="count", dest="quiet",
        default=0, help="Decrease the verbosity. Use twice for extra effect")
    parser.add_option('--check-schema', action="store_true",
        dest="check_schema", default=False, help="Check JSON against the "
        "embedded sessionstore schema")
    parser.add_option('--make-schema', action="store_true", dest="make_schema",
        default=False, help="Auto-generate a first draft schema from the JSON")

    # Allow pre-formatted descriptions
    parser.formatter.format_description = lambda description: description

    opts, args = parser.parse_args()

    # Set up clean logging to stderr
    log_levels = [logging.CRITICAL, logging.ERROR, logging.WARNING,
                  logging.INFO, logging.DEBUG]
    opts.verbose = min(opts.verbose - opts.quiet, len(log_levels) - 1)
    opts.verbose = max(opts.verbose, 0)
    logging.basicConfig(level=log_levels[opts.verbose],
                        format='%(levelname)s: %(message)s')

    if not args:
        # Fall back to the recovery files of all local Firefox profiles
        import glob
        args = glob.glob(os.path.expanduser(
            '~/.mozilla/firefox/*/sessionstore-backups/recovery.jsonlz4'))
        if not args:
            parser.error("no session file given and no recovery.jsonlz4 "
                         "found under ~/.mozilla/firefox")

    results = []
    for arg in args:
        with open(arg, 'rb') as fobj:
            data = decompress(fobj)
            data = json.loads(data)

        if opts.check_schema:
            # Raises ValueError on the first mismatch
            check_schema(data, schema=SCHEMA)
        elif opts.make_schema:
            result = check_schema(data, schema=SCHEMA, make_schema=True)
            print("---------------")
            print('\n'.join(sorted(result.values())))
        else:
            results += dump_tab_groups(data)

    # Emit the dump once, after all files have been collected, and only
    # when dumping was actually requested. Printing inside the loop
    # repeated the accumulated output per file and printed an empty dump
    # in the schema modes.
    if not (opts.check_schema or opts.make_schema):
        print(dump_to_text(results))