From 540d2efbd8fb61d2b716309431a3bfb6b7bd9a77 Mon Sep 17 00:00:00 2001 From: Dave Lane Date: Mon, 19 Mar 2012 14:08:24 -0500 Subject: [PATCH 1/5] Added "--exclude" argument to exclude file paths from syncing. --- bin/boto-rsync | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/bin/boto-rsync b/bin/boto-rsync index 76278b2..fde1c7e 100644 --- a/bin/boto-rsync +++ b/bin/boto-rsync @@ -209,6 +209,10 @@ def main(): help='Specify a specific S3 endpoint to connect to via boto\'s ' + \ '"host" connection argument (S3 only).' ) + parser.add_argument( + '--exclude', action='append', default=[], + help='Exclude files matching the specified pattern.' + ) parser.add_argument( '-g', '--grant', help='A canned ACL policy that will be granted on each file ' + \ @@ -313,6 +317,7 @@ def main(): cloud_secret_access_key = args.cloud_secret_access_key anon = args.anon endpoint = args.endpoint + exclude = args.exclude grant = args.grant metadata = args.metadata if not isinstance(metadata, dict): @@ -542,6 +547,22 @@ def main(): key_name = cloud_path + get_key_name(fullpath, path) file_size = os.path.getsize(fullpath) + # determine if the file should be excluded according to command line arguments. 
+ excludeFile = False + for excludePath in exclude: + if fullpath.startswith(excludePath): + excludeFile = True + continue + elif fullpath[len(path):].lstrip(os.sep).startswith(excludePath): + excludeFile = True + continue + if excludeFile: + sys.stdout.write( + 'Skipping %s (excluded path)\n' % + fullpath[len(path):].lstrip(os.sep) + ) + continue + if file_size == 0: if ignore_empty: if not quiet: From b399806a327d095dedc913b7748061a475be0987 Mon Sep 17 00:00:00 2001 From: Ken Schweickert Date: Wed, 1 Jul 2015 17:58:03 -0400 Subject: [PATCH 2/5] Make --exclude option more like rsync's with glob syntax --- bin/boto-rsync | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/bin/boto-rsync b/bin/boto-rsync index fde1c7e..8f8cd42 100644 --- a/bin/boto-rsync +++ b/bin/boto-rsync @@ -489,6 +489,21 @@ def main(): else: key_name = cloud_path + get_key_name(root, path) + '/' + # Skip whole directory if matches exclude argument(s) + # (Still checks subdirectories, but saves a little time and verbosity.) + excludeDir = False + for excludePath in exclude: + if fnmatch(root+os.sep, excludePath): + excludeDir = True + break + if excludeDir: + if not quiet: + sys.stdout.write( + 'Skipping %s (excluded path)\n' % + root + ) + continue + if ignore_empty and not files: if not quiet: sys.stdout.write( @@ -550,17 +565,15 @@ def main(): # determine if the file should be excluded according to command line arguments. 
excludeFile = False for excludePath in exclude: - if fullpath.startswith(excludePath): - excludeFile = True - continue - elif fullpath[len(path):].lstrip(os.sep).startswith(excludePath): + if fnmatch(fullpath, excludePath): excludeFile = True - continue + break if excludeFile: - sys.stdout.write( - 'Skipping %s (excluded path)\n' % - fullpath[len(path):].lstrip(os.sep) - ) + if not quiet: + sys.stdout.write( + 'Skipping %s (excluded path)\n' % + fullpath[len(path):].lstrip(os.sep) + ) continue if file_size == 0: From 0baa119ec1d953f9e9f37c348bac794bd4666ccb Mon Sep 17 00:00:00 2001 From: Ken Schweickert Date: Thu, 2 Jul 2015 02:49:12 -0400 Subject: [PATCH 3/5] Added last-modified comparison after size check to determine what to sync. Traditional rsync looks for equality in last-modified. Since we only have time uploaded to work with, this implementation opts to preserve the more recent copy, similar to rsync --update. I think this is relatively intuitive for uploads, but it could be problematic for downloads if the user is trying to "restore" as opposed to "get up to date" - I'm figuring in the case of the former it's easier to delete the local copy and try again than to recover inadvertently overwriting a file. This also implements a --size-only option like rsync's which can overcome the potential pitfall above. --- bin/boto-rsync | 174 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 119 insertions(+), 55 deletions(-) diff --git a/bin/boto-rsync b/bin/boto-rsync index 8f8cd42..75d3cf8 100644 --- a/bin/boto-rsync +++ b/bin/boto-rsync @@ -24,6 +24,7 @@ import sys, os, time, datetime, argparse, threading, signal from fnmatch import fnmatch import boto +from boto.utils import parse_ts __version__ = '0.8.1' @@ -275,6 +276,11 @@ def main(): 'size is 0. Warning: S3/GS often uses empty keys with special ' + \ 'trailing characters to specify directories.' 
) + parser.add_argument( + '--size-only', action='store_true', + help='Only compare size of files in deciding what to transfer, ' + \ + 'ignoring last-modified.' + ) parser.add_argument( '--delete', action='store_true', help='Delete extraneous files from destination dirs after the ' + \ @@ -330,6 +336,7 @@ def main(): no_recurse = args.no_recurse or glob skip_dirkeys = args.skip_dirkeys ignore_empty = args.ignore_empty + size_only = args.size_only delete = args.delete no_op = args.no_op quiet = args.quiet @@ -462,7 +469,7 @@ def main(): if glob and not fnmatch(key.name.split('/')[-1], glob): continue - keys[key.name] = key.size + keys[key.name] = {'size': key.size, 'modified': parse_ts(key.last_modified)} except Exception, e: raise e finally: @@ -523,7 +530,7 @@ def main(): key_name.endswith('_$folder$'): if not quiet: sys.stdout.write( - 'Skipping %s (size matches)\n' % + 'Skipping %s (directory already exists)\n' % key_name.replace('_$folder$', '/') ) create_dirkey = False @@ -549,7 +556,8 @@ def main(): reduced_redundancy=reduced, encrypt_key=encrypt ) - keys[key_name] = 0 + keys[key_name]['size'] = 0 + keys[key_name]['modified'] = datetime.datetime.now() # Clean stdout sys.stdout.write('\n') @@ -593,13 +601,22 @@ def main(): fullpath[len(path):].lstrip(os.sep) ) continue - elif keys[key_name] == file_size: - if not quiet: - sys.stdout.write( - 'Skipping %s (size matches)\n' % - fullpath[len(path):].lstrip(os.sep) - ) - continue + elif keys[key_name]['size'] == file_size: + if size_only: + if not quiet: + sys.stdout.write( + 'Skipping %s (size matches)\n' % + fullpath[len(path):].lstrip(os.sep) + ) + continue + # Compare last modified + elif datetime.datetime.fromtimestamp(os.path.getmtime(fullpath)) <= keys[key_name]['modified']: + if not quiet: + sys.stdout.write( + 'Skipping %s (not modified since last upload)\n' % + fullpath[len(path):].lstrip(os.sep) + ) + continue sys.stdout.write( '%s\n' % @@ -623,7 +640,8 @@ def main(): policy=grant, 
reduced_redundancy=reduced, encrypt_key=encrypt ) - keys[key_name] = file_size + keys[key_name]['size'] = file_size + keys[key_name]['modified'] = datetime.datetime.now() # Clean stdout sys.stdout.write('\n') @@ -655,7 +673,7 @@ def main(): if key_name in keys: del(keys[key_name]) - for key_name, key_size in keys.iteritems(): + for key_name, key_meta in keys.iteritems(): sys.stdout.write( 'deleting %s\n' % key_name[len(cloud_path):].replace('_$folder$', '/') @@ -698,16 +716,23 @@ def main(): sys.stdout.write('Skipping %s (not overwriting)\n' % filename) elif key.size == file_size: - copy_file = False - if not quiet: - if filename != key_name.split('/')[-1]: - sys.stdout.write( - 'Skipping %s -> %s (size matches)\n' % - filename, key_name.split('/')[-1] - ) - else: - sys.stdout.write('Skipping %s (size matches)\n' % - filename) + skip_reason = '' + if size_only: + skip_reason = 'size matches' + # Compare last modified + elif datetime.datetime.fromtimestamp(os.path.getmtime(path)) <= parse_ts(key.last_modified): + skip_reason = 'not modified since last upload' + if skip_reason != '': + copy_file = False + if not quiet: + if filename != key_name.split('/')[-1]: + sys.stdout.write( + 'Skipping %s -> %s (%s)\n' % + (filename, key_name.split('/')[-1], skip_reason) + ) + else: + sys.stdout.write('Skipping %s (%s)\n' % + (filename, skip_reason)) if copy_file: if filename != key_name.split('/')[-1]: @@ -801,19 +826,27 @@ def main(): ) copy_file = False elif key.size == os.path.getsize(fullpath): - if not quiet: - if rename: - sys.stdout.write( - 'Skipping %s -> %s (size matches)\n' % - keypath.replace('/', os.sep), - fullpath.split(os.sep)[-1] - ) - else: - sys.stdout.write( - 'Skipping %s (size matches)\n' % - fullpath.split(os.sep)[-1] - ) - copy_file = False + skip_reason = '' + if size_only: + skip_reason = 'size matches' + # Compare last modified + elif parse_ts(key.last_modified) < datetime.datetime.fromtimestamp(os.path.getmtime(fullpath)): + skip_reason = 'local 
copy more recently modified' + if skip_reason != '': + if not quiet: + if rename: + sys.stdout.write( + 'Skipping %s -> %s (%s)\n' % + (keypath.replace('/', os.sep), + fullpath.split(os.sep)[-1], + skip_reason) + ) + else: + sys.stdout.write( + 'Skipping %s (%s)\n' % + (fullpath.split(os.sep)[-1], skip_reason) + ) + copy_file = False if copy_file: if rename: @@ -890,12 +923,27 @@ def main(): fullpath[len(os.path.join(path, '')):] ) continue - elif key.size == os.path.getsize(fullpath) or \ - key.name.endswith('/') or \ + elif key.size == os.path.getsize(fullpath): + if size_only: + if not quiet: + sys.stdout.write( + 'Skipping %s (size matches)\n' % + fullpath[len(os.path.join(path, '')):] + ) + continue + # Compare last modified + elif parse_ts(key.last_modified) < datetime.datetime.fromtimestamp(os.path.getmtime(fullpath)): + if not quiet: + sys.stdout.write( + 'Skipping %s (modified since last upload)\n' % + fullpath[len(path):].lstrip(os.sep) + ) + continue + elif key.name.endswith('/') or \ key.name.endswith('_$folder$'): if not quiet: sys.stdout.write( - 'Skipping %s (size matches)\n' % + 'Skipping %s (directory already exists)\n' % fullpath[len(os.path.join(path, '')):] ) continue @@ -1011,17 +1059,24 @@ def main(): ) copy_file = False elif key.size == dest_key.size: - if not quiet: - if rename: - sys.stdout.write( - 'Skipping %s -> %s (size matches)\n' % - keypath.split('/')[-1], fullpath.split('/')[-1] - ) - else: - sys.stdout.write( - 'Skipping %s (size matches)\n' % fullpath - ) - copy_file = False + skip_reason = '' + if size_only: + skip_reason = 'size matches' + # Compare last modified + elif parse_ts(key.last_modified) < parse_ts(dest_key.last_modified): + skip_reason = 'destination more recently modified' + if skip_reason != '': + if not quiet: + if rename: + sys.stdout.write( + 'Skipping %s -> %s (%s)\n' % + (keypath.split('/')[-1], fullpath.split('/')[-1], skip_reason) + ) + else: + sys.stdout.write( + 'Skipping %s (%s)\n' % (fullpath, 
skip_reason) + ) + copy_file = False if copy_file: if rename: @@ -1119,12 +1174,21 @@ def main(): ) continue elif key.size == dest_key.size: - if not quiet: - sys.stdout.write( - 'Skipping %s (size matches)\n' % - fullpath.replace('_$folder$', '/') - ) - continue + if size_only: + if not quiet: + sys.stdout.write( + 'Skipping %s (size matches)\n' % + fullpath.replace('_$folder$', '/') + ) + continue + # Compare last modified + elif parse_ts(key.last_modified) < parse_ts(dest_key.last_modified): + if not quiet: + sys.stdout.write( + 'Skipping %s (destination more recently modified)\n' % + fullpath.replace('_$folder$', '/') + ) + continue sys.stdout.write('%s... ' % keypath.replace('_$folder$', '/')) sys.stdout.flush() From 5914a6bea1ff9e599e69b53151dabed428635d72 Mon Sep 17 00:00:00 2001 From: Ken Schweickert Date: Thu, 2 Jul 2015 03:24:25 -0400 Subject: [PATCH 4/5] Updated documentation. --- README.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 9ca9466..a799449 100644 --- a/README.rst +++ b/README.rst @@ -121,6 +121,9 @@ Options keys/files whose size is 0. Warning: S3/GS often uses empty keys with special trailing characters to specify directories. + --exclude Exclude files matching the specified pattern. + --size-only Only compare size of files in deciding what to transfer, + ignoring last-modified. --delete Delete extraneous files from destination dirs after the transfer has finished (e.g. rsync's --delete- after). @@ -151,7 +154,6 @@ http://boto.cloudhackers.com/en/latest/boto_config_tut.html Known Issues and Limitations ================================================================================ -* Differences between keys/files are assumed *only* by checking the size. * Due to the nature of how directories work in S3/GS, some non-standard folder structures might not transfer correctly. Empty directories may also be overlooked in some cases. When in doubt, use "-n" first. 
@@ -159,6 +161,10 @@ Known Issues and Limitations on some systems. See the "--glob" option's help text for more info. * At this time, the script does not take advantage of boto's "multipart" transfer methods. (pull requests welcome!) +* The last-modified time on the cloud platform may actually be the upload time. + The comparison attempts to preserve the more recent copy (similar to + ``rsync --update``). You can use the ``--size-only`` option if this discrepancy + is problematic. Disclaimers and Warnings From 7e3ae84ef9be2ccdeeb69b5d719749f2b68c61ac Mon Sep 17 00:00:00 2001 From: Ken Schweickert Date: Mon, 6 Jul 2015 03:10:52 -0400 Subject: [PATCH 5/5] bugfix: KeyError when creating new file/directory --- bin/boto-rsync | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/bin/boto-rsync b/bin/boto-rsync index 75d3cf8..42e4297 100644 --- a/bin/boto-rsync +++ b/bin/boto-rsync @@ -556,8 +556,7 @@ def main(): reduced_redundancy=reduced, encrypt_key=encrypt ) - keys[key_name]['size'] = 0 - keys[key_name]['modified'] = datetime.datetime.now() + keys[key_name] = {'size': 0, 'modified': datetime.datetime.now()} # Clean stdout sys.stdout.write('\n') @@ -640,8 +639,7 @@ def main(): policy=grant, reduced_redundancy=reduced, encrypt_key=encrypt ) - keys[key_name]['size'] = file_size - keys[key_name]['modified'] = datetime.datetime.now() + keys[key_name] = {'size': file_size, 'modified': datetime.datetime.now()} # Clean stdout sys.stdout.write('\n')