From b8e6933ec01d08f26ca5784585583ae16529cb16 Mon Sep 17 00:00:00 2001 From: tim Date: Wed, 22 Apr 2020 16:49:49 +0200 Subject: [PATCH] Add some tools for copying data out of Firefox, for backups. --- copy_firefox_data.sh | 76 ++++++ firefox_tabgroups_export.py | 456 ++++++++++++++++++++++++++++++++++++ get_firefox_tabs.py | 71 ------ 3 files changed, 532 insertions(+), 71 deletions(-) create mode 100755 copy_firefox_data.sh create mode 100755 firefox_tabgroups_export.py delete mode 100755 get_firefox_tabs.py diff --git a/copy_firefox_data.sh b/copy_firefox_data.sh new file mode 100755 index 0000000..1cab5aa --- /dev/null +++ b/copy_firefox_data.sh @@ -0,0 +1,76 @@ +#!/bin/sh +# +# This script makes local backups of tabs and passwords from Firefox, and then regularly copies them to a backup server +# +# firefox_decrypt.py is available from https://github.com/Unode/firefox_decrypt +# +# firefox_tabgroups_export.py is available from https://gist.github.com/ssokolow/35097553e5173935e597 +# The version in this repo modifies the original to export only as text +# + +epoch=$( date +%Y%m%d%H%M ); +#server=10.25.0.1 +server=treehouse.org.za +backups=~/Downloads/firefox + +copy() { + content=$1 + if [ -e ${content} ] && ping -qc1 ${server} >/dev/null; then + rsync -a --delete -e 'ssh -i ~/.ssh/id_rsa-firefox_pass' ${content} ${server}:remember/ + #scp -ri ~/.ssh/id_rsa-firefox_pass ${backups}/firefox_tabs.${epoch} ${server}:remember/tabs + fi +} + +if [ -x ~/bin/firefox_tabgroups_export.py ]; then + do_copy=0 + mkdir -p ${backups}/tabs + mkdir -p ${backups}/tabs/sessionstore + previous=$( ls ${backups}/tabs/firefox_tabs.* 2>/dev/null | tail -1 ) + ~/bin/firefox_tabgroups_export.py > ${backups}/tabs/firefox_tabs.${epoch} + if [ -f "${previous}" ]; then # quoted: an empty ${previous} (first run) must test false + if diff -q ${previous} ${backups}/tabs/firefox_tabs.${epoch} >/dev/null; then + # No difference, remove this file + rm ${backups}/tabs/firefox_tabs.${epoch} + else + do_copy=1 + fi + else + do_copy=1 + fi + + # Delete backups older than 
$time + find ${backups}/tabs -ctime +365 -delete + + if [ "${do_copy}" -eq 1 ]; then # [ 0 ] is true (non-empty string); compare numerically + for session in ~/.mozilla/firefox/*/sessionstore*/*; do + sessionfile=$( basename ${session} ) + cp ${session} ${backups}/tabs/sessionstore/${epoch}_${sessionfile} + done + copy ${backups}/tabs + fi +fi + +if [ -x ~/bin/firefox_decrypt.py ]; then + do_copy=0 + mkdir -p ${backups}/passwords + previous=$( ls ${backups}/passwords/firefox_passwords.* 2>/dev/null | tail -1 ) + echo '' | ~/bin/firefox_decrypt.py > ${backups}/passwords/firefox_passwords.${epoch} + if [ -f "${previous}" ]; then # quoted: an empty ${previous} (first run) must test false + if diff -q ${previous} ${backups}/passwords/firefox_passwords.${epoch} >/dev/null; then + # No difference, remove this file + rm ${backups}/passwords/firefox_passwords.${epoch} + else + do_copy=1 + fi + else + do_copy=1 + fi + + # Delete backups older than $time + find ${backups}/passwords -ctime +1 -delete + + if [ "${do_copy}" -eq 1 ]; then # only relink/copy when a new dump was kept; otherwise the link target was just removed + ln -srf ${backups}/passwords/firefox_passwords.${epoch} ${backups}/passwords/firefox_passwords + copy ${backups}/passwords + fi +fi diff --git a/firefox_tabgroups_export.py b/firefox_tabgroups_export.py new file mode 100755 index 0000000..16698f5 --- /dev/null +++ b/firefox_tabgroups_export.py @@ -0,0 +1,456 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""Quick and Dirty Tab Groups Dumper for Firefox +--snip-- + +Copyright (C) 2014 Stephan Sokolow + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE +OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +""" + +__appname__ = "Quick And Dirty Tab Groups Dumper for Firefox" +__author__ = "Stephan Sokolow (deitarion/SSokolow)" +__version__ = "0.2" +__license__ = "MIT" + +import logging +log = logging.getLogger(__name__) + +import json, os, re + +#{{{ Data Types for Schema +bad_anchor_char_re = re.compile('[^A-Za-z0-9-_:.]+') +url_re = re.compile(""" + ^data:image/png;base64,| + ^(chrome|file|https?)://| + ^about:(blank|home|newtab)$| + ^javascript: + """, re.VERBOSE) +is_nonempty_string = lambda x: isinstance(x, str) and x  # str: Python 3 has no `unicode` type + +is_url = lambda x: isinstance(x, str) and url_re.match(x) +is_nullable_url = lambda x: x is None or is_url(x) + +is_natural_int = lambda x: isinstance(x, int) and x >= 0 +is_positive_int = lambda x: is_natural_int(x) and x > 0 + +is_bool_string = lambda x: isinstance(x, str) and x in ('true', 'false') +def is_int_string(x): + """Return true if the input is a string containing an integer""" + if not is_nonempty_string(x): + return False + try: + int(x) + return True + except ValueError: + return False +is_int_ish = lambda x: is_int_string(x) or is_natural_int(x) + +def is_coord_pair(x): + """Return true if the input is two comma-separated integers""" + return (is_nonempty_string(x) and + len(x.split(',')) == 2 and + all(is_int_string(y) for y in x.split(','))) + +#}}} +#{{{ Schema + +def apply_subschema(prefix, subschema): + """Generate a new schema dict with prefixed paths""" + return {prefix + x: y for x, y in subschema.items()} + +TAB_ENTRY_SCHEMA = { + 'ID/': 
is_natural_int, + 'docIdentifier/': is_natural_int, + 'docshellID/': is_natural_int, + 'owner_b64/': is_nonempty_string, + 'referrer/': is_url, + 'scroll/': is_nonempty_string, + 'subframe/': lambda x: isinstance(x, bool), + 'title/': is_nonempty_string, + 'url/': is_url, + +} + +TABS_SCHEMA = { + 'tabs/': lambda x: isinstance(x, list) and x, + 'tabs/list/': lambda x: isinstance(x, dict) and x, + 'tabs/list/attributes/': lambda x: isinstance(x, dict), + 'tabs/list/attributes/image/': is_url, + 'tabs/list/entries/': lambda x: isinstance(x, list) and x, + 'tabs/list/entries/list/': lambda x: isinstance(x, dict) and x, + 'tabs/list/entries/list/ID/': is_natural_int, + 'tabs/list/entries/list/children/': lambda x: isinstance(x, list), + 'tabs/list/entries/list/children/list/': lambda x: isinstance(x, dict), + 'tabs/list/entries/list/url/': is_url, + 'tabs/list/extData/': lambda x: isinstance(x, dict) and x, + 'tabs/list/extData/tabview-tab/': is_nonempty_string, + 'tabs/list/extData/weaveLastUsed/': is_nonempty_string, + 'tabs/list/hidden/': lambda x: isinstance(x, bool), + 'tabs/list/image/': is_nullable_url, + 'tabs/list/index/': is_positive_int, + 'tabs/list/lastAccessed/': is_natural_int, + 'tabs/list/pinned/': lambda x: isinstance(x, bool), + 'tabs/list/scroll/': lambda x: isinstance(x, dict), + 'tabs/list/scroll/scroll/': is_nonempty_string, +} +TABS_SCHEMA.update(apply_subschema( + 'tabs/list/entries/list/', TAB_ENTRY_SCHEMA)) +TABS_SCHEMA.update(apply_subschema( + 'tabs/list/entries/list/children/list/', TAB_ENTRY_SCHEMA)) +TABS_SCHEMA.update(apply_subschema( + 'tabs/list/entries/list/children/list/children/list/', TAB_ENTRY_SCHEMA)) + +CLOSED_TABS_SCHEMA = { + '_closedTabs/': lambda x: isinstance(x, list), + '_closedTabs/list/': lambda x: isinstance(x, dict), + '_closedTabs/list/closedAt/': is_natural_int, + '_closedTabs/list/image/': is_url, + '_closedTabs/list/pos/': is_positive_int, + '_closedTabs/list/state/': lambda x: isinstance(x, dict), + 
'_closedTabs/list/state/attributes/': lambda x: isinstance(x, dict), + '_closedTabs/list/state/entries/': lambda x: isinstance(x, list), + '_closedTabs/list/state/entries/list/': lambda x: isinstance(x, dict), + '_closedTabs/list/state/extData/': lambda x: isinstance(x, dict), + '_closedTabs/list/state/extData/tabview-tab/': + lambda x: isinstance(x, str),  # str: Python 3 has no `unicode` type + '_closedTabs/list/state/hidden/': lambda x: isinstance(x, bool), + '_closedTabs/list/state/image/': is_url, + '_closedTabs/list/state/index/': is_positive_int, + '_closedTabs/list/state/lastAccessed/': is_natural_int, + '_closedTabs/list/state/scroll/': lambda x: isinstance(x, dict), + '_closedTabs/list/state/scroll/scroll/': is_coord_pair, + '_closedTabs/list/title/': lambda x: isinstance(x, str), +} +CLOSED_TABS_SCHEMA.update(apply_subschema( + '_closedTabs/list/state/entries/list/', TAB_ENTRY_SCHEMA)) + +WINDOW_SCHEMA = { + 'extData/': lambda x: isinstance(x, dict), + 'extData/tabview-group/': is_nonempty_string, + 'extData/tabview-groups/': is_nonempty_string, + 'extData/tabview-ui/': is_nonempty_string, + 'height/': is_int_ish, + 'screenX/': is_int_ish, + 'screenY/': is_int_ish, + 'selected/': is_positive_int, + 'sizemode/': lambda x: isinstance(x, str), + 'title/': is_nonempty_string, + 'width/': is_int_ish, +} + +#TODO: Finish deduplicating this schema definition +SCHEMA = { + '': lambda x: isinstance(x, dict) and x, + '_closedWindows/': lambda x: isinstance(x, list), + '_closedWindows/list/': lambda x: isinstance(x, dict), + '_closedWindows/list/closedAt/': lambda x: isinstance(x, int), + '_closedWindows/list/extData/__SessionManagerWindowId/': + lambda x: isinstance(x, str), + 'global/': lambda x: isinstance(x, dict), + 'scratchpads/': lambda x: isinstance(x, list), + 'selectedWindow/': is_natural_int, + 'session/': lambda x: isinstance(x, dict), + 'session/lastUpdate/': is_natural_int, + 'session/recentCrashes/': is_natural_int, + 'session/startTime/': is_natural_int, + 'windows/': 
lambda x: isinstance(x, list) and x, + 'windows/list/': lambda x: isinstance(x, dict) and x, + 'windows/list/busy/': lambda x: isinstance(x, bool), + 'windows/list/extData/': lambda x: isinstance(x, dict) and x, + 'windows/list/extData/__SessionManagerWindowId/': is_nonempty_string, + 'windows/list/extData/tabview-last-session-group-name/': + is_nonempty_string, + 'windows/list/extData/tabview-visibility/': is_bool_string, + + 'windows/list/tabs/list/disallow/': is_nonempty_string, + 'windows/list/tabs/list/entries/list/cacheKey/': is_positive_int, + 'windows/list/tabs/list/entries/list/children/list/children/': + lambda x: isinstance(x, list), + 'windows/list/tabs/list/entries/list/children/list/children/list/': + lambda x: isinstance(x, dict), + 'windows/list/tabs/list/entries/list/structuredCloneState/': + is_nonempty_string, + 'windows/list/tabs/list/entries/list/structuredCloneVersion/': + is_positive_int, + 'windows/list/tabs/list/extData/': lambda x: isinstance(x, dict), + 'windows/list/tabs/list/extData/tabview-tab/': is_nonempty_string, + 'windows/list/tabs/list/extData/weaveLastUsed/': is_nonempty_string, + 'windows/list/tabs/list/lastAccessed/': is_natural_int, + 'windows/list/tabs/list/pageStyle/': + lambda x: isinstance(x, (str, dict)),  # str: Python 3 has no `unicode` type + 'windows/list/tabs/list/pageStyle/pageStyle/': is_nonempty_string, + 'windows/list/tabs/list/userTypedClear/': is_natural_int, + 'windows/list/tabs/list/userTypedValue/': is_nonempty_string, +} +SCHEMA.update(apply_subschema('_closedWindows/list/', CLOSED_TABS_SCHEMA)) +SCHEMA.update(apply_subschema('_closedWindows/list/', TABS_SCHEMA)) +SCHEMA.update(apply_subschema('_closedWindows/list/', WINDOW_SCHEMA)) +SCHEMA.update(apply_subschema('windows/list/', CLOSED_TABS_SCHEMA)) +SCHEMA.update(apply_subschema('windows/list/', TABS_SCHEMA)) +SCHEMA.update(apply_subschema('windows/list/', WINDOW_SCHEMA)) + +#}}} +#{{{ Schema-related routines + +def make_schema_line(path, data): + """Generate a draft schema line to be 
copy-pasted""" + tmp = "'%s': lambda x: isinstance(x, %s)," % (path, type(data).__name__) + return '%-100s # %s' % (tmp, repr(data)[:80]) + +def check_schema(data, dom_path='', schema=None, make_schema=False): + """Recursive exploration for JSON via schemas""" + schema = schema or {} + result = {} + + def fail(msg): + """Unified failure message""" + dump = result[dom_path] = make_schema_line(dom_path, data) + if isinstance(data, (list, dict)): + raise ValueError("%s: %s @ %s\n %s" % ( + msg, type(data), dom_path, dump)) + else: + raise ValueError("%s: %s(%s) @ %s\n %s" % ( + msg, type(data), data, dom_path, dump)) + + if dom_path not in schema: + if make_schema: + result[dom_path] = make_schema_line(dom_path, data) + else: + fail("Unexpected element") + + if make_schema or schema[dom_path](data): + if isinstance(data, list): + for x in data: + result.update(check_schema(x, dom_path + 'list/', + schema=schema, make_schema=make_schema)) + elif isinstance(data, dict): + for x in data: + result.update(check_schema(data[x], '%s%s/' % (dom_path, x), + schema=schema, make_schema=make_schema)) + elif not (data is None or isinstance(data, (int, float, str))):  # str: Python 3 has no `unicode` type + fail("Unexpected data type") + else: + fail("Element failed schema") + + return result + +#}}} + +def _collect_tab_metadata(tab): + """Restructure the tab metadata into a single dict""" + grp_id = None + grp_data = tab.get('extData', {}).get('tabview-tab', {}) + if grp_data: + grp_data = json.loads(grp_data) + if grp_data: + grp_id = grp_data.get('groupID', None) + del grp_data + + empty = { + 'title' : 'New tab', + 'url' : 'about:newtab', + } + if len(tab['entries']) == 0: + content = empty + else: + content = tab['entries'][-1] # -1 is most recent + + return { + 'index': tab.get('index'), + 'group': grp_id, + 'title': content.get('title', content['url']), + 'url': content['url'], + 'favicon': tab.get('image', None), + 'pinned': tab.get('pinned', False) + } + +def dump_tab_groups(data): + """Load a JSON file 
from a path and extract the relevant data structure.""" + windows = [] + for window in data['windows']: + grp_names = json.loads(window.get('extData', {}).get( + 'tabview-group', '{}')) + grp_names = {int(x): grp_names[x]['title'] for x in grp_names.keys()} + + tabs = [_collect_tab_metadata(tab) for tab in window['tabs']] + + # Group the tabs by group ID and then replace the IDs with the + # group names without risking naming collisions + groups = {} + [groups.setdefault(tab['group'], []).append(tab) for tab in tabs] + groups = [(grp_names.get(k, None), v) for k, v in groups.items()] + #groups.sort() # TODO: Sort case-insensitively + + windows.append(groups) + return windows + +def dump_to_html(dump, for_tiddlywiki=False): + """Convert `dump_tab_groups` output to HTML + + Specifically, an outline represented in HTML via the XOXO microformat + so that it's both human- and machine-readable: + http://www.microformats.org/wiki/xoxo + + @todo: Clean up this code and reuse anything possible to produce an XBEL + output option: + http://en.wikipedia.org/wiki/XBEL + """ + from lxml import etree + from lxml.builder import E + + def attr_class(*args): + """workaround for `class` being a reserved word""" + return {"class": ' '.join(args)} + + h3_prefix = lambda: (E.span('!!!', attr_class('copy-only')) + if for_tiddlywiki else '') + + noicon = PLACEHOLDER_FAVICON_URI + + tab_count, windows = 0, [] + for pos, window in enumerate(dump): + k_toc = [] + e_groups = [] + + for group, tabs in window: + if not group and all(x.get('pinned') for x in tabs): + grp_name = "" + else: + grp_name = group or '' + grp_key = (grp_name, bad_anchor_char_re.sub('_', grp_name).lower()) + + grp_key_actual, idx = grp_key, 0 + while grp_key_actual in k_toc: + idx += 1 + grp_key_actual = '%s%s' % (grp_key, idx) + k_toc.append(grp_key_actual) + + tab_count += len(tabs) + if for_tiddlywiki: + lines = [] + for x in tabs: + lines.extend(['* [[', E.b(x['title']), '|', + E.a(x['url'], href=x['url']), + 
']]\n']) + e_tabs = E.pre(*lines) + else: + e_tabs = E.ul(*[E.li(E.a(E.img(src=x['favicon'] or noicon, + alt='', width='16', height='16'), + x['title'], href=x['url'])) for x in tabs]) + + e_groups.append(E.li( + E.h3(h3_prefix(), grp_name, id=grp_key_actual[1]), + e_tabs + )) + + windows.append(E.li(E.h2("Window %d" % pos), *e_groups)) + + e_toc = E.ul(*[E.li(E.a(x[0], href='#%s' % x[1])) for x in k_toc]) + + title_str = '%s (%s tabs total)' % (HTML_EXPORT_TITLE, tab_count) + return etree.tostring(E.html( + E.head( + E.title(title_str), + E.style(HTML_EXPORT_STYLE)), + E.body( + E.h1(title_str), e_toc, + E.ul(attr_class('xoxo'), *windows)))) + + +def dump_to_text(dump): + """Convert `dump_tab_groups` output to text + """ + tab_count, windows = 0, [] + text = '' + for pos, window in enumerate(dump): + text += "Window {0}\n".format(pos) + + for group, tabs in window: + if not group and all(x.get('pinned') for x in tabs): + grp_name = "" + else: + grp_name = group or '' + text += "{0}\n".format(grp_name) + tab_count += len(tabs) + for x in tabs: + text += "\t{1}\n{0}\n".format(x['url'], x['title']) + text += "\n" + + text += '({0} tabs total)'.format(tab_count) + + return text + +# From https://gist.github.com/Tblue/62ff47bef7f894e92ed5 +def decompress(file_obj): + import lz4.block + if file_obj.read(8) != b"mozLz40\0": + raise ValueError("Invalid magic number")  # a bare string is not raisable in Python 3 + return lz4.block.decompress(file_obj.read()) + + +if __name__ == '__main__': + from optparse import OptionParser + parser = OptionParser(version="%%prog v%s" % __version__, + usage="%prog [opts] [path]", + description=__doc__.replace('\r\n', '\n').split('\n--snip--\n')[0]) + parser.add_option('-v', '--verbose', action="count", dest="verbose", + default=2, help="Increase the verbosity. Use twice for extra effect") + parser.add_option('-q', '--quiet', action="count", dest="quiet", + default=0, help="Decrease the verbosity. 
Use twice for extra effect") + parser.add_option('--check-schema', action="store_true", + dest="check_schema", default=False, help="Check JSON against the " + "embedded sessionstore schema") + parser.add_option('--make-schema', action="store_true", dest="make_schema", + default=False, help="Auto-generate a first draft schema from the JSON") + + # Allow pre-formatted descriptions + parser.formatter.format_description = lambda description: description + + opts, args = parser.parse_args() + + # Set up clean logging to stderr + log_levels = [logging.CRITICAL, logging.ERROR, logging.WARNING, + logging.INFO, logging.DEBUG] + opts.verbose = min(opts.verbose - opts.quiet, len(log_levels) - 1) + opts.verbose = max(opts.verbose, 0) + logging.basicConfig(level=log_levels[opts.verbose], + format='%(levelname)s: %(message)s') + + if not args: + import glob + args = glob.glob(os.path.expanduser( + '~/.mozilla/firefox/*/sessionstore-backups/recovery.jsonlz4')) + + results = [] + for arg in args: + with open(arg, 'rb') as fobj: + data = decompress(fobj) + data = json.loads(data) + + if opts.check_schema: + result = check_schema(data, schema=SCHEMA) + elif opts.make_schema: + result = check_schema(data, schema=SCHEMA, make_schema=True) + print("---------------") + print('\n'.join(sorted(result.values()))) + else: + results += dump_tab_groups(data) + text = dump_to_text(results) + print(text) diff --git a/get_firefox_tabs.py b/get_firefox_tabs.py deleted file mode 100755 index 60a37b3..0000000 --- a/get_firefox_tabs.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/python -# -# Taken in part from https://gist.github.com/ssokolow/35097553e5173935e597 - -import glob, json, os, re - -def get_tab_metadata(tab): - grp_id = None - grp_data = tab.get('extData', {}).get('tabview-tab', {}) - if grp_data: - grp_data = json.loads(grp_data) - if grp_data: - grp_id = grp_data.get('groupID', None) - del grp_data - - content = tab['entries'][-1] # -1 is most recent - return { - 'index' : 
tab.get('index'), - 'group' : grp_id, - 'title' : content.get('title', content.get('url')), - 'url' : content.get('url'), - 'pinned': tab.get('pinned', False) - } - - -def get_tabgroups(data): - windows = [] - for window in data['windows']: - grp_names = json.loads(window.get('extData', {}).get( - 'tabview-group', '{}')) - grp_names = {int(x): grp_names[x]['title'] for x in grp_names.keys()} - - tabs = [] - for tab in window['tabs']: - tabs.append(get_tab_metadata(tab)) - - # Group the tabs by group ID and then replace the IDs with the - # group names without risking naming collisions - groups = {} - for tab in tabs: - groups.setdefault(tab['group'], []).append(tab) - groups = [(grp_names.get(k, None), v) for k, v in groups.items()] - groups.sort() # TODO: Sort case-insensitively - - windows.append(groups) - return windows - - -def print_tabs(groups): - for pos, window in enumerate(groups): - #print 'Window ' + str(pos+1) - for group, tabs in window: - if not group and all(x.get('pinned') for x in tabs): - grp_name = 'Window ' + str(pos+1) + ' | Pinned Tabs' - else: - grp_name = group or 'Unnamed' - print 'Window ' + str(pos+1) + ' | Group: ' + grp_name - for x in tabs: - print '\t' + x['title'] - print '\t' + x['url'] - print - - -filenames = glob.glob(os.path.expanduser('~/.mozilla/firefox/*/sessionstore-backups/recovery.js')) -for filename in filenames: - with open(filename, 'r') as fobj: - data = json.load(fobj) - groups = get_tabgroups(data) - print_tabs(groups) - -# vim: expandtab shiftwidth=4 softtabstop=4 tw=500