#!/usr/bin/env python3 """ piwigo_export.py — Export every Piwigo photo with a JSON metadata sidecar. Directory structure mirrors the album hierarchy; photos that belong to multiple albums are copied into each album folder. Photos with no album membership go into _unsorted/. Optionally embeds metadata directly into the exported image copy via exiftool (requires: apt install libimage-exiftool-perl or brew install exiftool). Usage examples -------------- # Export with JSON sidecars only: python3 piwigo_export.py \ --dbhost localhost --dbuser piwigo --dbpassword secret --dbname piwigo \ --src-path /var/www/piwigo --output-dir ./export # Also embed as XMP tags in the exported copy: python3 piwigo_export.py ... --metadata xmp # Embed all three metadata formats at once: python3 piwigo_export.py ... --metadata exif iptc xmp """ import argparse import json import math import os import pathlib import shutil import subprocess import sys from contextlib import closing import pymysql # --------------------------------------------------------------------------- # Database — bulk loaders (one query per table, not one per image) # --------------------------------------------------------------------------- def load_categories(connection, prefix): """Return {id: row} for every category.""" with closing(connection.cursor()) as cur: cur.execute(f'SELECT * FROM `{prefix}categories`') return {row['id']: row for row in cur} def load_all_tags_by_image(connection, prefix): """Return {image_id: [tag_name, ...]} for the whole library.""" result: dict[int, list[str]] = {} with closing(connection.cursor()) as cur: cur.execute( f'SELECT it.image_id, t.name' f' FROM `{prefix}image_tag` it' f' JOIN `{prefix}tags` t ON it.tag_id = t.id' ) for row in cur: result.setdefault(row['image_id'], []).append(row['name']) return result def load_all_categories_by_image(connection, prefix): """Return {image_id: [category_id, ...]} for the whole library.""" result: dict[int, list[int]] = {} with 
closing(connection.cursor()) as cur: cur.execute( f'SELECT image_id, category_id FROM `{prefix}image_category`' ) for row in cur: result.setdefault(row['image_id'], []).append(row['category_id']) return result # --------------------------------------------------------------------------- # Category path helpers # --------------------------------------------------------------------------- def category_display_path(cat_id, categories): """Return a human-readable path like 'Holidays / France / Normandy'.""" parts = [] seen: set[int] = set() cid = cat_id while cid is not None and cid not in seen: seen.add(cid) cat = categories.get(cid) if cat is None: break parts.append(cat['name']) cid = cat.get('id_uppercat') parts.reverse() return ' / '.join(parts) def category_fs_path(cat_id, categories): """Return a pathlib.Path for the album's place in the output tree.""" parts = [] seen: set[int] = set() cid = cat_id while cid is not None and cid not in seen: seen.add(cid) cat = categories.get(cid) if cat is None: break parts.append(_safe_dirname(cat['name'])) cid = cat.get('id_uppercat') parts.reverse() return pathlib.Path(*parts) if parts else pathlib.Path('_root') def _safe_dirname(name: str) -> str: """Replace characters that are awkward in directory names.""" for ch in ('/', '\\', '\0', ':'): name = name.replace(ch, '_') return name.strip() or '_unnamed' # --------------------------------------------------------------------------- # Metadata embedding via exiftool # --------------------------------------------------------------------------- # IPTC IIM maximum byte lengths for string fields we write. # exiftool silently truncates to these limits, so we apply them ourselves # first — otherwise a re-run would see a spurious collision between the # full Piwigo value and the already-truncated on-disk value. 
_IPTC_MAX_BYTES: dict[str, int] = {
    'IPTC:ObjectName': 64,
    'IPTC:By-line': 32,
    'IPTC:Caption-Abstract': 2000,
    'IPTC:Keywords': 64,                 # per keyword
    'IPTC:SupplementalCategories': 32,   # per entry
}


def _iptc_truncate(tag: str, value: str) -> str:
    """Normalise *value* for storage in *tag*: strip whitespace (exiftool
    does this on write) then truncate to the IPTC byte limit (UTF-8 aware)."""
    value = value.strip()
    limit = _IPTC_MAX_BYTES.get(tag)
    if limit is None:
        return value
    encoded = value.encode('utf-8')
    if len(encoded) <= limit:
        return value
    # Truncate on a UTF-8 character boundary: a hard byte slice can cut a
    # multi-byte sequence in half; errors='ignore' drops the partial char.
    return encoded[:limit].decode('utf-8', errors='ignore')


# Tags whose values are always lists (multi-value fields).
_LIST_TAGS = {
    'IPTC:Keywords',
    'IPTC:SupplementalCategories',
    'XMP-dc:Subject',
    'XMP-dc:Creator',
    'XMP-lr:HierarchicalSubject',
}

# GPS tags use floating-point; compare with a tolerance instead of string
# equality.  (1e-5 degrees ≈ 1 metre on the ground — more than enough.)
_GPS_TAGS = {'GPS:GPSLatitude', 'GPS:GPSLongitude'}


def check_exiftool():
    """Exit with an install hint unless the exiftool binary is on PATH."""
    if shutil.which('exiftool') is None:
        sys.exit(
            'ERROR: exiftool not found on PATH.\n'
            ' Install it with: apt install libimage-exiftool-perl\n'
            ' or: brew install exiftool\n'
            'Then re-run, or omit --metadata.'
        )


def _exif_datetime(s) -> str:
    """'YYYY-MM-DD[ HH:MM:SS]' → 'YYYY:MM:DD HH:MM:SS' (EXIF format)."""
    s = str(s)
    date = s[:10].replace('-', ':')
    time = s[11:19] if len(s) > 10 else '00:00:00'
    return f'{date} {time}'


def _iptc_date(s) -> str:
    """'YYYY-MM-DD[ ...]' → 'YYYYMMDD'."""
    return str(s)[:10].replace('-', '')


def _iptc_time(s) -> str:
    """'YYYY-MM-DD HH:MM:SS' → 'HHMMSS+0000'."""
    s = str(s)
    t = s[11:19] if len(s) > 10 else '00:00:00'
    # NOTE(review): assumes the DB timestamp is UTC — confirm against the
    # Piwigo server's timezone configuration.
    return t.replace(':', '') + '+0000'


def _xmp_datetime(s) -> str:
    """'YYYY-MM-DD[ HH:MM:SS]' → 'YYYY-MM-DDTHH:MM:SS'."""
    s = str(s)
    t = s[11:19] if len(s) > 10 else '00:00:00'
    return f'{s[:10]}T{t}'


def _build_metadata_tags(metadata: dict, fmt: str) -> dict:
    """
    Build a dict of { 'GROUP:TagName': value } for everything we want to
    write.  List-valued tags (Keywords, Subject, …) use Python lists as the
    value.  Scalar tags use a single string/number.

    Raises ValueError for an unknown *fmt* (valid: 'exif', 'iptc', 'xmp').
    """
    tags: dict = {}
    title = metadata.get('title') or ''
    author = metadata.get('author') or ''
    description = metadata.get('description') or ''
    kw_list = metadata.get('tags') or []
    albums = metadata.get('albums') or []
    date_str = metadata.get('date_created')
    rating = metadata.get('rating')
    lat = metadata.get('latitude')
    lon = metadata.get('longitude')

    if fmt == 'exif':
        if title:
            tags['EXIF:ImageDescription'] = title
        if author:
            tags['EXIF:Artist'] = author
        if description:
            tags['EXIF:UserComment'] = description
        if date_str:
            dt = _exif_datetime(date_str)
            tags['EXIF:DateTimeOriginal'] = dt
            tags['EXIF:CreateDate'] = dt
    elif fmt == 'iptc':
        if title:
            tags['IPTC:ObjectName'] = _iptc_truncate('IPTC:ObjectName', title)
        if author:
            tags['IPTC:By-line'] = _iptc_truncate('IPTC:By-line', author)
        if description:
            tags['IPTC:Caption-Abstract'] = _iptc_truncate(
                'IPTC:Caption-Abstract', description)
        if date_str:
            tags['IPTC:DateCreated'] = _iptc_date(date_str)
            tags['IPTC:TimeCreated'] = _iptc_time(date_str)
        if kw_list:
            tags['IPTC:Keywords'] = [
                _iptc_truncate('IPTC:Keywords', k) for k in kw_list]
        if albums:
            tags['IPTC:SupplementalCategories'] = [
                _iptc_truncate('IPTC:SupplementalCategories', a)
                for a in albums]
    elif fmt == 'xmp':
        if title:
            tags['XMP-dc:Title'] = title
        if author:
            tags['XMP-dc:Creator'] = [author]  # XMP Creator is a list
        if description:
            tags['XMP-dc:Description'] = description
        if date_str:
            tags['XMP-xmp:CreateDate'] = _xmp_datetime(date_str)
        if kw_list:
            tags['XMP-dc:Subject'] = list(kw_list)
        if albums:
            tags['XMP-lr:HierarchicalSubject'] = list(albums)
        if rating is not None:
            tags['XMP-xmp:Rating'] = int(round(rating))
    else:
        raise ValueError(f'Unknown metadata format: {fmt!r}')

    # GPS is written to the EXIF GPS IFD regardless of which metadata format
    # was chosen — it is the most universally readable location.
    if lat is not None and lon is not None:
        tags['GPS:GPSLatitude'] = abs(lat)
        tags['GPS:GPSLatitudeRef'] = 'N' if lat >= 0 else 'S'
        tags['GPS:GPSLongitude'] = abs(lon)
        tags['GPS:GPSLongitudeRef'] = 'E' if lon >= 0 else 'W'
    return tags


def _read_existing_metadata(image_path: pathlib.Path) -> dict:
    """
    Return all metadata currently in *image_path* as { 'GROUP:Tag': value }.

    Flags used:
      -G     prefix every key with its group name (e.g. 'EXIF:', 'GPS:')
      -n     return numeric values as numbers (avoids degree-string
             formatting for GPS, avoids localised number formats, etc.)
      -json  JSON output

    Returns {} (with a warning on stderr) if exiftool fails or its output
    cannot be parsed — callers then treat the file as having no metadata.
    """
    result = subprocess.run(
        ['exiftool', '-json', '-G', '-n', str(image_path)],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        print(
            f'WARNING: could not read metadata from {image_path}: '
            f'{result.stderr.strip()}',
            file=sys.stderr,
        )
        return {}
    try:
        data = json.loads(result.stdout)
        return data[0] if data else {}
    except (json.JSONDecodeError, IndexError):
        return {}


def _is_repeated_char(s: str, min_reps: int = 10) -> bool:
    """Return True if *s* consists of a single character repeated at least
    *min_reps* times (e.g. '??????????', '----------', '          ')."""
    s = str(s)
    return len(s) >= min_reps and len(set(s)) == 1


def _values_equal(tag: str, existing, desired) -> bool:
    """Return True if existing and desired values are effectively the same."""
    if tag in _GPS_TAGS:
        try:
            # BUG FIX: this used rel_tol=1e-5, but a *relative* tolerance
            # scales with the coordinate (≈50 m at latitude 50°).  The
            # intent — "1e-5 degrees ≈ 1 metre" — is an absolute tolerance.
            return math.isclose(
                float(existing), float(desired), abs_tol=1e-5)
        except (TypeError, ValueError):
            pass
    return str(existing).strip() == str(desired).strip()


def _filter_tags(
    desired: dict,
    existing: dict,
    image_path: pathlib.Path,
    never_overwrite: bool = False,
) -> dict:
    """
    Compare desired tags against what is already embedded in the file and
    return only the tags that need to be written.

    Rules
    -----
    Scalar tags:
      • Not present in file → include for writing.
      • Present, same value → skip silently.
      • Present, different  → overwrite if the existing value is empty or a
        repeated-character placeholder; otherwise prompt the user (unless
        *never_overwrite* is True, in which case the existing value is
        always kept).

    List tags (Keywords, Subject, …):
      • Each item is checked individually.
      • Items already present in the file's list are silently skipped.
      • Items not yet present are queued for writing.
      • No collision error — lists are additive by nature.
    """
    to_write: dict = {}
    for tag, new_value in desired.items():
        existing_value = existing.get(tag)

        if tag in _LIST_TAGS:
            new_items = (
                new_value if isinstance(new_value, list) else [new_value]
            )
            if existing_value is None:
                to_write[tag] = new_items
            else:
                # exiftool returns a scalar when only one value is stored,
                # a list when there are several — normalise to a list.
                ex_list = (
                    [str(v).strip() for v in existing_value]
                    if isinstance(existing_value, list)
                    else [str(existing_value).strip()]
                )
                to_add = [
                    v for v in new_items if str(v).strip() not in ex_list
                ]
                if to_add:
                    to_write[tag] = to_add
        else:  # scalar tag
            if existing_value is None:
                to_write[tag] = new_value
            elif _values_equal(tag, existing_value, new_value):
                pass  # already there with the same value — nothing to do
            elif never_overwrite:
                pass  # keep existing value, skip silently
            elif str(existing_value).strip() == '':
                # Existing value is empty — silently replace with Piwigo value.
                to_write[tag] = new_value
            elif _is_repeated_char(existing_value):
                # Existing value is a placeholder (e.g. '???????????') —
                # silently replace it with the Piwigo value.
                to_write[tag] = new_value
            else:
                print(
                    f'\nMetadata collision in {image_path}:\n'
                    f'  tag      : {tag}\n'
                    f'  existing : {existing_value!r}\n'
                    f'  Piwigo   : {new_value!r}',
                    file=sys.stderr,
                )
                while True:
                    choice = input(
                        '  Use Piwigo value? [y/N] '
                    ).strip().lower()
                    if choice in ('n', 'no', ''):
                        break  # leave this tag out of to_write
                    if choice in ('y', 'yes'):
                        to_write[tag] = new_value
                        break
                    print('  Please enter y or n.')
    return to_write


def _tags_to_exiftool_args(tags: dict) -> list[str]:
    """Convert { 'GROUP:Tag': value } back into exiftool -TAG=VALUE strings."""
    args: list[str] = []
    for tag, value in tags.items():
        if isinstance(value, list):
            for item in value:
                args.append(f'-{tag}={item}')
        else:
            args.append(f'-{tag}={value}')
    return args


def embed_metadata(
    dest_image: pathlib.Path,
    metadata: dict,
    fmt: str | list[str],
    never_overwrite: bool = False,
):
    """
    Read the image's existing metadata, check for conflicts with what
    Piwigo knows, then write only the tags that are new or not yet present.

    *fmt* may be a single format string or a list of format strings; when
    multiple formats are given their tag dicts are merged before writing so
    that only one exiftool invocation is needed.

    If *never_overwrite* is True, tags that already exist in the file are
    always kept as-is, with no prompt.
    """
    formats = [fmt] if isinstance(fmt, str) else fmt
    desired: dict = {}
    for f in formats:
        desired.update(_build_metadata_tags(metadata, f))
    if not desired:
        return

    existing = _read_existing_metadata(dest_image)
    to_write = _filter_tags(desired, existing, dest_image, never_overwrite)
    if not to_write:
        return  # every tag was already present with the correct value

    cmd = (
        ['exiftool', '-overwrite_original']
        + _tags_to_exiftool_args(to_write)
        + [str(dest_image)]
    )
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(
            f'  WARNING: exiftool failed for {dest_image.name}:\n'
            f'  {result.stderr.strip()}',
            file=sys.stderr,
        )


# ---------------------------------------------------------------------------
# Core export for a single image
# ---------------------------------------------------------------------------

def export_image(
    image_row: dict,
    tags_by_image: dict,
    cats_by_image: dict,
    categories: dict,
    src_path: pathlib.Path,
    output_dir: pathlib.Path,
    metadata_format: list[str] | None,
    overwrite: bool,
    never_overwrite_metadata: bool = False,
) -> int:
    """
    Copy the image (and its JSON sidecar) to every destination album folder.

    Returns the number of image files actually written.

    Raises RuntimeError on a filename collision with a file exported from a
    *different* source path — unless *overwrite* is set, in which case the
    second file replaces the first (as the error message promises).
    """
    src_file = src_path / image_row['path']
    if not src_file.is_file():
        print(f'WARNING: source file not found: {src_file}', file=sys.stderr)
        return 0

    image_id = image_row['id']
    tags = tags_by_image.get(image_id, [])
    cat_ids = cats_by_image.get(image_id, [])

    lat = image_row.get('latitude')
    lon = image_row.get('longitude')
    score = image_row.get('rating_score')
    metadata = {
        'title': image_row.get('name'),
        'author': image_row.get('author'),
        'date_created': str(image_row['date_creation'])
        if image_row.get('date_creation') else None,
        'date_added': str(image_row['date_available'])
        if image_row.get('date_available') else None,
        'description': image_row.get('comment'),
        'tags': tags,
        'albums': [category_display_path(cid, categories) for cid in cat_ids],
        'width': image_row.get('width'),
        'height': image_row.get('height'),
        'filesize': image_row.get('filesize'),
        # BUG FIX: these used truthiness (`if image_row.get('latitude')`),
        # which silently turned a valid 0.0 latitude/longitude (equator /
        # prime meridian) or a 0 rating into None.  Compare against None.
        'latitude': float(lat) if lat is not None else None,
        'longitude': float(lon) if lon is not None else None,
        'rating': float(score) if score is not None else None,
        'original_path': image_row['path'],
    }

    if cat_ids:
        # dict.fromkeys dedupes while preserving order: two albums whose
        # sanitised names map to the same directory would otherwise copy
        # (and count) the same file twice.
        dest_dirs = list(dict.fromkeys(
            output_dir / category_fs_path(cid, categories)
            for cid in cat_ids
        ))
    else:
        dest_dirs = [output_dir / '_unsorted']

    filename = pathlib.Path(image_row['path']).name
    stem = pathlib.Path(filename).stem
    written = 0

    for dest_dir in dest_dirs:
        dest_dir.mkdir(parents=True, exist_ok=True)
        dest_image = dest_dir / filename
        dest_sidecar = dest_dir / f'{stem}.json'

        # Collision check: would we overwrite a file from a *different* source?
        if dest_image.exists() and dest_sidecar.exists():
            try:
                existing = json.loads(
                    dest_sidecar.read_text(encoding='utf-8'))
            except json.JSONDecodeError:
                existing = None  # corrupted sidecar — let the overwrite logic decide
            # BUG FIX: the original raised unconditionally, even with
            # --overwrite, although its own message says "Use --overwrite
            # to ignore".  Honour the flag.
            if (existing is not None
                    and existing.get('original_path') != image_row['path']
                    and not overwrite):
                raise RuntimeError(
                    f"Filename collision at {dest_image}:\n"
                    f" already written from : {existing.get('original_path')}\n"
                    f" now requested from : {image_row['path']}\n"
                    f"Use --overwrite to ignore (the second file will replace the first)."
                )

        # Skip if both files are already present (and --overwrite not set).
        if dest_image.exists() and dest_sidecar.exists() and not overwrite:
            print(f' SKIP (both files exist, use --overwrite to replace): {dest_image}')
            continue

        # Copy image file
        shutil.copy2(str(src_file), str(dest_image))
        written += 1

        if metadata_format:
            embed_metadata(
                dest_image, metadata, metadata_format,
                never_overwrite_metadata)

        # Write/refresh the sidecar so it stays in sync with the DB.
        dest_sidecar.write_text(
            json.dumps(metadata, indent=2, ensure_ascii=False, default=str),
            encoding='utf-8',
        )
    return written


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

def main():
    """Parse arguments, bulk-load the Piwigo DB, and export every image."""
    parser = argparse.ArgumentParser(
        description='Export Piwigo photos with JSON metadata sidecars.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    db = parser.add_argument_group('database')
    db.add_argument('--dbhost', metavar='HOST')
    db.add_argument('--dbuser', required=True, metavar='USER')
    db.add_argument('--dbpassword', metavar='PASS')
    db.add_argument('--dbname', required=True, metavar='NAME')
    db.add_argument(
        '--db-prefix', default='piwigo_', metavar='PREFIX',
        help='Piwigo table prefix (default: %(default)s)',
    )
    io = parser.add_argument_group('paths')
    io.add_argument(
        '--src-path', required=True, metavar='DIR',
        help='Root of the Piwigo installation; piwigo_images.path is relative to this.',
    )
    io.add_argument(
        '--output-dir', required=True, metavar='DIR',
        help='Directory to write exported files into (created if absent).',
    )
    behaviour = parser.add_argument_group('behaviour')
    behaviour.add_argument(
        '--metadata', choices=['exif', 'iptc', 'xmp'], nargs='+',
        metavar='FORMAT',
        help='Also embed metadata into the exported image copy using exiftool. '
             'One or more of: exif, iptc, xmp. '
             'Example: --metadata exif iptc xmp',
    )
    behaviour.add_argument(
        '--overwrite', action='store_true',
        help='Re-export image files that already exist in the output directory. '
             'JSON sidecars are always refreshed.',
    )
    behaviour.add_argument(
        '--no-overwrite-metadata', action='store_true',
        help='When embedding metadata, never overwrite a tag that already has a '
             'value in the file — skip it silently instead of prompting.',
    )
    args = parser.parse_args()

    # Fail fast before touching the DB if the user asked for embedding but
    # exiftool is not installed.
    if args.metadata:
        check_exiftool()

    src_path = pathlib.Path(args.src_path)
    output_dir = pathlib.Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f'Connecting to {args.dbuser}@{args.dbhost}/{args.dbname} …')
    connection = pymysql.connect(
        host=args.dbhost,
        user=args.dbuser,
        password=args.dbpassword,
        database=args.dbname,
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor,
    )
    prefix = args.db_prefix

    # closing() guarantees the connection is released even if the export
    # loop raises (the original only closed on the success path).
    with closing(connection):
        print('Loading category tree …')
        categories = load_categories(connection, prefix)
        print(f'  {len(categories)} categories.')

        print('Loading tag assignments …')
        tags_by_image = load_all_tags_by_image(connection, prefix)
        print(f'  tags for {len(tags_by_image)} images.')

        print('Loading album memberships …')
        cats_by_image = load_all_categories_by_image(connection, prefix)
        print(f'  memberships for {len(cats_by_image)} images.')

        print('Exporting images …')
        total_images = 0
        total_written = 0
        with closing(connection.cursor()) as cur:
            cur.execute(
                f'SELECT id, file, path, name, comment, author,'
                f' date_creation, date_available,'
                f' width, height, filesize,'
                f' latitude, longitude, rating_score'
                f' FROM `{prefix}images`'
            )
            for image_row in cur:
                total_images += 1
                total_written += export_image(
                    image_row,
                    tags_by_image,
                    cats_by_image,
                    categories,
                    src_path,
                    output_dir,
                    args.metadata,
                    args.overwrite,
                    args.no_overwrite_metadata,
                )
                if total_images % 100 == 0:
                    print(f'  … {total_images} processed, {total_written} written so far')

    print(
        f'\nDone. {total_images} images processed, '
        f'{total_written} image files written to {output_dir}/'
    )


if __name__ == '__main__':
    main()