#!/usr/bin/env python # encoding: utf-8 """ sortphotos.py Created on 3/2/2013 Copyright (c) S. Andrew Ning. All rights reserved. """ from __future__ import print_function from __future__ import with_statement import subprocess import os import sys import shutil try: import json except: import simplejson as json import filecmp from datetime import datetime, timedelta import re import locale # Setting locale to the 'local' value locale.setlocale(locale.LC_ALL, '') exiftool_location = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Image-ExifTool', 'exiftool') # -------- convenience methods ------------- def parse_date_exif(date_string): """ extract date info from EXIF data YYYY:MM:DD HH:MM:SS or YYYY:MM:DD HH:MM:SS+HH:MM or YYYY:MM:DD HH:MM:SS-HH:MM or YYYY:MM:DD HH:MM:SSZ """ # split into date and time elements = str(date_string).strip().split() # ['YYYY:MM:DD', 'HH:MM:SS'] if len(elements) < 1: return None # parse year, month, day date_entries = elements[0].split(':') # ['YYYY', 'MM', 'DD'] # check if three entries, nonzero data, and no decimal (which occurs for timestamps with only time but no date) if len(date_entries) == 3 and date_entries[0] > '0000' and '.' not in ''.join(date_entries): year = int(date_entries[0]) month = int(date_entries[1]) day = int(date_entries[2]) else: return None # parse hour, min, second time_zone_adjust = False hour = 12 # defaulting to noon if no time data provided minute = 0 second = 0 if len(elements) > 1: time_entries = re.split('(\+|-|Z)', elements[1]) # ['HH:MM:SS', '+', 'HH:MM'] time = time_entries[0].split(':') # ['HH', 'MM', 'SS'] if len(time) == 3: hour = int(time[0]) minute = int(time[1]) second = int(time[2].split('.')[0]) elif len(time) == 2: hour = int(time[0]) minute = int(time[1]) # adjust for time-zone if needed if len(time_entries) > 2: time_zone = time_entries[2].split(':') # ['HH', 'MM'] if len(time_zone) == 2: time_zone_hour = int(time_zone[0]) time_zone_min = int(time_zone[1]) # check if + or - if time_entries[1] == '+': time_zone_hour *= -1 dateadd = timedelta(hours=time_zone_hour, minutes=time_zone_min) time_zone_adjust = True # form date object try: date = datetime(year, month, day, hour, minute, second) except ValueError: return None # errors in time format # try converting it (some "valid" dates are way before 1900 and cannot be parsed by strtime later) try: date.strftime('%Y/%m-%b') # any format with year, month, day, would work here. except ValueError: return None # errors in time format # adjust for time zone if necessary if time_zone_adjust: date += dateadd return date def get_oldest_timestamp(data, additional_groups_to_ignore, additional_tags_to_ignore, print_all_tags=False): """data as dictionary from json. Should contain only time stamps except SourceFile""" # save only the oldest date date_available = False oldest_date = datetime.now() oldest_keys = [] # save src file src_file = data['SourceFile'] # ssetup tags to ignore ignore_groups = ['ICC_Profile'] + additional_groups_to_ignore ignore_tags = ['SourceFile', 'XMP:HistoryWhen'] + additional_tags_to_ignore if print_all_tags: print('All relevant tags:') # run through all keys for key in data.keys(): # check if this key needs to be ignored, or is in the set of tags that must be used if (key not in ignore_tags) and (key.split(':')[0] not in ignore_groups) and 'GPS' not in key: date = data[key] if print_all_tags: print(str(key) + ', ' + str(date)) # (rare) check if multiple dates returned in a list, take the first one which is the oldest if isinstance(date, list): date = date[0] try: exifdate = parse_date_exif(date) # check for poor-formed exif data, but allow continuation except Exception as e: exifdate = None if exifdate and exifdate < oldest_date: date_available = True oldest_date = exifdate oldest_keys = [key] elif exifdate and exifdate == oldest_date: oldest_keys.append(key) if not date_available: oldest_date = None if print_all_tags: print() return src_file, oldest_date, oldest_keys def check_for_early_morning_photos(date, day_begins): """check for early hour photos to be grouped with previous day""" if date.hour < day_begins: print('moving this photo to the previous day for classification purposes (day_begins=' + str(day_begins) + ')') date = date - timedelta(hours=date.hour+1) # push it to the day before for classificiation purposes return date # this class is based on code from Sven Marnach (http://stackoverflow.com/questions/10075115/call-exiftool-from-a-python-script) class ExifTool(object): """used to run ExifTool from Python and keep it open""" sentinel = "{ready}" def __init__(self, executable=exiftool_location, verbose=False): self.executable = executable self.verbose = verbose def __enter__(self): self.process = subprocess.Popen( ['perl', self.executable, "-stay_open", "True", "-@", "-"], stdin=subprocess.PIPE, stdout=subprocess.PIPE) return self def __exit__(self, exc_type, exc_value, traceback): self.process.stdin.write(b'-stay_open\nFalse\n') self.process.stdin.flush() def execute(self, *args): args = args + ("-execute\n",) self.process.stdin.write(str.join("\n", args).encode('utf-8')) self.process.stdin.flush() output = "" fd = self.process.stdout.fileno() while not output.rstrip(' \t\n\r').endswith(self.sentinel): increment = os.read(fd, 4096) # try: if/tbo-hxtp-dem self.verbose: sys.stdout.write(increment.decode('utf-8')) output += increment.decode('utf-8') # except: # continue return output.rstrip(' \t\n\r')[:-len(self.sentinel)] def get_metadata(self, *args): try: return json.loads(self.execute(*args)) except ValueError as e: sys.stdout.write('No files to parse or invalid data\n') sys.stdout.write(e) exit() # --------------------------------------- def sortPhotos(src_dir, dest_dir, sort_format, rename_format, recursive=False, copy_files=False, test=False, remove_duplicates=True, day_begins=0, additional_groups_to_ignore=['File'], additional_tags_to_ignore=[], use_only_groups=None, use_only_tags=None, verbose=True, keep_filename=False): """ This function is a convenience wrapper around ExifTool based on common usage scenarios for sortphotos.py Parameters --------------- src_dir : str directory containing files you want to process dest_dir : str directory where you want to move/copy the files to sort_format : str date format code for how you want your photos sorted (https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior) rename_format : str date format code for how you want your files renamed (https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior) None to not rename file recursive : bool True if you want src_dir to be searched recursively for files (False to search only in top-level of src_dir) copy_files : bool True if you want files to be copied over from src_dir to dest_dir rather than moved test : bool True if you just want to simulate how the files will be moved without actually doing any moving/copying remove_duplicates : bool True to remove files that are exactly the same in name and a file hash keep_filename : bool True to append original filename in case of duplicates instead of increasing number day_begins : int what hour of the day you want the day to begin (only for classification purposes). Defaults at 0 as midnight. Can be used to group early morning photos with the previous day. must be a number between 0-23 additional_groups_to_ignore : list(str) tag groups that will be ignored when searching for file data. By default File is ignored additional_tags_to_ignore : list(str) specific tags that will be ignored when searching for file data. use_only_groups : list(str) a list of groups that will be exclusived searched across for date info use_only_tags : list(str) a list of tags that will be exclusived searched across for date info verbose : bool True if you want to see details of file processing """ # some error checking if not os.path.exists(src_dir): raise Exception('Source directory does not exist') # setup arguments to exiftool args = ['-j', '-a', '-G'] # setup tags to ignore if use_only_tags is not None: additional_groups_to_ignore = [] additional_tags_to_ignore = [] for t in use_only_tags: args += ['-' + t] elif use_only_groups is not None: additional_groups_to_ignore = [] for g in use_only_groups: args += ['-' + g + ':Time:All'] else: args += ['-time:all'] if recursive: args += ['-r'] args += [src_dir] # get all metadata with ExifTool(verbose=verbose) as e: print('Preprocessing with ExifTool. May take a while for a large number of files.') sys.stdout.flush() metadata = e.get_metadata(*args) # setup output to screen num_files = len(metadata) print() if test: test_file_dict = {} # parse output extracting oldest relevant date for idx, data in enumerate(metadata): # extract timestamp date for photo src_file, date, keys = get_oldest_timestamp(data, additional_groups_to_ignore, additional_tags_to_ignore) # fixes further errors when using unicode characters like "\u20AC" src_file.encode('utf-8') if verbose: # write out which photo we are at ending = ']' if test: ending = '] (TEST - no files are being moved/copied)' print('[' + str(idx+1) + '/' + str(num_files) + ending) print('Source: ' + src_file) else: # progress bar numdots = int(20.0*(idx+1)/num_files) sys.stdout.write('\r') sys.stdout.write('[%-20s] %d of %d ' % ('='*numdots, idx+1, num_files)) sys.stdout.flush() # check if no valid date found if not date: if verbose: print('No valid dates were found using the specified tags. File will remain where it is.') print() # sys.stdout.flush() continue # ignore hidden files if os.path.basename(src_file).startswith('.'): print('hidden file. will be skipped') print() continue if verbose: print('Date/Time: ' + str(date)) print('Corresponding Tags: ' + ', '.join(keys)) # early morning photos can be grouped with previous day (depending on user setting) date = check_for_early_morning_photos(date, day_begins) # create folder structure dir_structure = date.strftime(sort_format) dirs = dir_structure.split('/') dest_file = dest_dir for thedir in dirs: dest_file = os.path.join(dest_file, thedir) if not test and not os.path.exists(dest_file): os.makedirs(dest_file) # rename file if necessary filename = os.path.basename(src_file) if rename_format is not None and date is not None: _, ext = os.path.splitext(filename) filename = date.strftime(rename_format) + ext.lower() # setup destination file dest_file = os.path.join(dest_file, filename) root, ext = os.path.splitext(dest_file) if verbose: name = 'Destination ' if copy_files: name += '(copy): ' else: name += '(move): ' print(name + dest_file) # check for collisions append = 1 fileIsIdentical = False while True: if (not test and os.path.isfile(dest_file)) or (test and dest_file in test_file_dict.keys()): # check for existing name if test: dest_compare = test_file_dict[dest_file] else: dest_compare = dest_file if remove_duplicates and filecmp.cmp(src_file, dest_compare): # check for identical files fileIsIdentical = True if verbose: print('Identical file already exists. Duplicate will be ignored.\n') break else: # name is same, but file is different if keep_filename: orig_filename = os.path.splitext(os.path.basename(src_file))[0] dest_file = root + '_' + orig_filename + '_' + str(append) + ext else: dest_file = root + '_' + str(append) + ext append += 1 if verbose: print('Same name already exists...renaming to: ' + dest_file) else: break # finally move or copy the file if test: test_file_dict[dest_file] = src_file else: if fileIsIdentical: continue # ignore identical files else: if copy_files: shutil.copy2(src_file, dest_file) else: shutil.move(src_file, dest_file) if verbose: print() # sys.stdout.flush() if not verbose: print() def main(): import argparse # setup command line parsing parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, description='Sort files (primarily photos and videos) into folders by date\nusing EXIF and other metadata') parser.add_argument('src_dir', type=str, help='source directory') parser.add_argument('dest_dir', type=str, help='destination directory') parser.add_argument('-r', '--recursive', action='store_true', help='search src_dir recursively') parser.add_argument('-c', '--copy', action='store_true', help='copy files instead of move') parser.add_argument('-s', '--silent', action='store_true', help='don\'t display parsing details.') parser.add_argument('-t', '--test', action='store_true', help='run a test. files will not be moved/copied\ninstead you will just a list of would happen') parser.add_argument('--sort', type=str, default='%Y/%m-%b', help="choose destination folder structure using datetime format \n\ https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior. \n\ Use forward slashes / to indicate subdirectory(ies) (independent of your OS convention). \n\ The default is '%%Y/%%m-%%b', which separates by year then month \n\ with both the month number and name (e.g., 2012/02-Feb).") parser.add_argument('--rename', type=str, default=None, help="rename file using format codes \n\ https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior. \n\ default is None which just uses original filename") parser.add_argument('--keep-filename', action='store_true', help='In case of duplicated output filenames an increasing number and the original file name will be appended', default=False) parser.add_argument('--keep-duplicates', action='store_true', help='If file is a duplicate keep it anyway (after renaming).') parser.add_argument('--day-begins', type=int, default=0, help='hour of day that new day begins (0-23), \n\ defaults to 0 which corresponds to midnight. Useful for grouping pictures with previous day.') parser.add_argument('--ignore-groups', type=str, nargs='+', default=[], help='a list of tag groups that will be ignored for date informations.\n\ list of groups and tags here: http://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/\n\ by default the group \'File\' is ignored which contains file timestamp data') parser.add_argument('--ignore-tags', type=str, nargs='+', default=[], help='a list of tags that will be ignored for date informations.\n\ list of groups and tags here: http://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/\n\ the full tag name needs to be included (e.g., EXIF:CreateDate)') parser.add_argument('--use-only-groups', type=str, nargs='+', default=None, help='specify a restricted set of groups to search for date information\n\ e.g., EXIF') parser.add_argument('--use-only-tags', type=str, nargs='+', default=None, help='specify a restricted set of tags to search for date information\n\ e.g., EXIF:CreateDate') # parse command line arguments args = parser.parse_args() sortPhotos(args.src_dir, args.dest_dir, args.sort, args.rename, args.recursive, args.copy, args.test, not args.keep_duplicates, args.day_begins, args.ignore_groups, args.ignore_tags, args.use_only_groups, args.use_only_tags, not args.silent, args.keep_filename) if __name__ == '__main__': main()