#!/usr/bin/python

# syncPozo.py -- incremental and full backups of a Data.fs file.
#
# The original version was called rePozo.py
# Originally written by Anthony Baxter
# Significantly modified by Barry Warsaw
# modified by Huub so it can be used to sync to a backup Data.fs file

"""syncPozo.py -- incremental and full backups of a Data.fs file.

Usage: %(program)s [options]
Where:

    -v / --verbose
        Verbose mode.

    -h / --help
        Print this text and exit.

    -m dir
    --master-directory=dir
        Master directory containing the Data.fs file.  Required.

    -s dir
    --slave-directory=dir
        Slave directory holding the backup copy of the Data.fs file.
        Required.

    -f filename
    --filename=filename
        Name of the Data.fs file, used in both the master and the slave
        directory.  Default is Data.fs

    --force-backup
        Force a full backup even if the slave file has grown since the
        last sync.  Without this option the sync aborts in that case, and
        a manual step (such as moving away or deleting the slave backup
        file) is required before syncing will work again.

"""

import os
import sys
import md5
import gzip
import time
import errno
import getopt

from ZODB.FileStorage import FileStorage

# Name the script was invoked with; interpolated into the usage text
# through the %(program)s placeholder of the module docstring.
program = sys.argv[0]

# Separator used when listing invalid positional arguments.
COMMASPACE = ', '
# Maximum number of bytes dofile() requests per read() call.
READCHUNK = 16 * 1024
# Gates the output of log().
# NOTE(review): this already defaults to True, so the -v / --verbose
# option has no visible effect -- confirm whether that is intended.
VERBOSE = True

def usage(code, msg=''):
    """Print the usage text, then an optional message, and exit.

    The text goes to stdout when code is 0 and to stderr otherwise.  The
    process always terminates through sys.exit(code).
    """
    if code == 0:
        stream = sys.stdout
    else:
        stream = sys.stderr

    # The module docstring doubles as the help text; %(program)s and
    # friends are filled in from the module globals.
    stream.write('%s\n' % (__doc__ % globals(),))
    if msg:
        stream.write('%s\n' % (msg,))

    sys.exit(code)


def log(msg, *args):
    """Write msg % args as one line to stderr when VERBOSE is set."""
    if not VERBOSE:
        return
    # Diagnostics go to stderr so they never mix with stdout output.
    sys.stderr.write('%s\n' % (msg % args,))


def parseargs():
    """Parse sys.argv and return the collected options.

    The returned object has the attributes filename, master_dir,
    slave_dir and force_backup.  The usage text is printed and the
    process exits (through usage()) on -h/--help, on an option getopt
    rejects, on stray positional arguments, and when either directory
    option is missing.  -v/--verbose sets the module-level VERBOSE flag.
    """
    global VERBOSE
    try:
        # Every short option that takes a value needs a trailing colon;
        # without the one after "f", "-f name" is parsed as a bare flag
        # and "name" ends up as an invalid positional argument.
        opts, args = getopt.getopt(sys.argv[1:], 'vhf:m:s:',
                                   ['verbose', 'help',
                                    'filename=', 'master-directory=',
                                    'slave-directory=', 'force-backup'])
    except getopt.error:
        # sys.exc_info() keeps this clause valid on every Python version.
        usage(1, sys.exc_info()[1])

    class Options:
        filename = 'Data.fs'  # name of input Data.fs file
        master_dir = None     # name of directory holding master Data.fs
        slave_dir = None      # name of directory holding backup Data.fs
        force_backup = False  # full backup even if the slave file has grown

    options = Options()

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-v', '--verbose'):
            VERBOSE = True
        elif opt in ('-f', '--filename'):
            options.filename = arg
        elif opt in ('-m', '--master-directory'):
            options.master_dir = arg
        elif opt in ('-s', '--slave-directory'):
            options.slave_dir = arg
        elif opt == '--force-backup':
            options.force_backup = True
        else:
            # getopt only hands back options it was told about, so getting
            # here means the table above and this chain are out of sync.
            assert False, (opt, arg)

    # Any other arguments are invalid
    if args:
        usage(1, 'Invalid arguments: ' + COMMASPACE.join(args))

    # Sanity checks
    if options.master_dir is None:
        usage(1, '--master-directory is required')
    if options.slave_dir is None:
        usage(1, '--slave-directory is required')

    return options

def fsync(afile):
    """Flush afile and force its contents out to disk.

    afile is either a plain Python file object or a wrapper (such as the
    object returned by gzip.open()) that has no fileno() of its own but
    exposes the real file as its ``fileobj`` attribute.
    """
    afile.flush()
    try:
        raw = afile.fileobj
    except AttributeError:
        # Plain file object: it is its own underlying file.
        raw = afile
    os.fsync(raw.fileno())

# Read bytes (no more than n, or to EOF if n is None) in chunks from the
# current position in file fp.  Pass each chunk as an argument to func().
# Return the total number of bytes read == the total number of bytes
# passed in all to func().  Leaves the file position just after the
# last byte read.
def dofile(func, fp, n=None):
    bytesread = 0L
    while n is None or n > 0:
        if n is None:
            todo = READCHUNK
        else:
            todo = min(READCHUNK, n)
        data = fp.read(todo)
        if not data:
            break
        func(data)
        nread = len(data)
        bytesread += nread
        if n is not None:
            n -= nread
    return bytesread


def checksum(fp, n):
    """Return the hex MD5 digest of the next n bytes read from fp.

    Reading starts at the current position of fp and is done through
    dofile(), so n may also be None to digest everything up to EOF.
    """
    digest = md5.new()
    # The digest's update method is exactly the per-chunk callback needed.
    dofile(digest.update, fp, n)
    return digest.hexdigest()

def appendfile(options, dst, start, n):
    """Append n bytes of the master file, from offset start, to dst.

    The source is options.filename inside options.master_dir.  The data is
    flushed and fsync'ed before dst is closed.  Returns the hex MD5 digest
    of the appended bytes.  Raises AssertionError when fewer than n bytes
    could be read from the source.
    """
    digest = md5.new()
    inputfile_path = os.path.join(options.master_dir, options.filename)
    ifp = open(inputfile_path, 'rb')
    try:
        ifp.seek(start)
        ofp = open(dst, 'ab')
        try:
            def func(data):
                digest.update(data)
                ofp.write(data)

            ndone = dofile(func, ifp, n)
            # Raised explicitly (rather than with an assert statement) so
            # a short read is still caught when Python runs with -O.
            if ndone != n:
                raise AssertionError('read %s bytes from %s, expected %s'
                                     % (ndone, inputfile_path, n))
            fsync(ofp)
        finally:
            # Close both handles even when the copy fails part-way.
            ofp.close()
    finally:
        ifp.close()
    return digest.hexdigest()

def copyfile(options, dst, start, n):
    """Copy n bytes of the master file, from offset start, into dst.

    The source is options.filename inside options.master_dir.  For
    robustness the data is first written, flushed and fsync'ed to the
    temp file 'tmp.tmp' next to dst; only a complete copy is renamed to
    dst.  Returns the hex MD5 digest of the copied bytes.  Raises
    AssertionError when fewer than n bytes could be read from the source,
    in which case dst is left untouched.
    """
    digest = md5.new()
    inputfile_path = os.path.join(options.master_dir, options.filename)
    tempname = os.path.join(os.path.dirname(dst), 'tmp.tmp')
    ifp = open(inputfile_path, 'rb')
    try:
        ifp.seek(start)
        ofp = open(tempname, 'wb')
        try:
            def func(data):
                digest.update(data)
                ofp.write(data)

            ndone = dofile(func, ifp, n)
            # Raised explicitly (rather than with an assert statement) so
            # that a short copy is never renamed over dst, even when
            # Python runs with -O.
            if ndone != n:
                raise AssertionError('read %s bytes from %s, expected %s'
                                     % (ndone, inputfile_path, n))
            fsync(ofp)
        finally:
            # Close both handles even when the copy fails part-way.
            ofp.close()
    finally:
        ifp.close()
    os.rename(tempname, dst)
    return digest.hexdigest()


def scandat(options):
    """Return the last record of the slave's .dat file.

    The .dat file is options.filename + '.dat' inside options.slave_dir;
    each line describes one backup step as

        filename startpos endpos checksum

    Returns the tuple (filename, startpos, endpos, checksum) of the last
    non-blank line, with both positions converted to integers.  If there
    is no .dat file, or it holds no records, returns

        None, None, None, None

    Any IOError other than "file not found" is re-raised.  A record with
    fewer than four fields raises ValueError.
    """
    datfile = os.path.join(options.slave_dir, options.filename + '.dat')

    try:
        fp = open(datfile)
    except IOError:
        # sys.exc_info() keeps this clause valid on every Python version.
        if sys.exc_info()[1].errno != errno.ENOENT:
            raise
        return None, None, None, None

    try:
        lines = fp.readlines()
    finally:
        fp.close()

    # A trailing empty line must not hide the real last record.
    records = [line for line in lines if line.strip()]
    if not records:
        return None, None, None, None

    # We only care about the last one.  Split from the right so that a
    # file name containing whitespace stays in one piece.
    fn, startpos, endpos, digest = records[-1].rsplit(None, 3)
    return fn, int(startpos), int(endpos), digest


def do_full_backup(options):
    """Copy the whole master Data.fs to the slave and start a new .dat file.

    Only the bytes up to the end of the last complete transaction are
    copied.  The slave file is replaced through copyfile(), after which
    the .dat file next to it is rewritten with a single record describing
    this full copy.  Sets options.full to True.
    """
    master_path = os.path.join(options.master_dir, options.filename)
    # Per the FileStorage notes carried over from the original script: the
    # constructor scans the file and getSize() then reports the position
    # just after the last valid transaction record, which is how far the
    # copy should go.
    storage = FileStorage(master_path, read_only=True)
    end_of_data = storage.getSize()
    storage.close()
    options.full = True

    dest = os.path.join(options.slave_dir, options.filename)
    if os.path.exists(dest):
        log('overwriting existing file: %s', dest)
    log('writing full backup: %s bytes to %s', end_of_data, dest)
    digest = copyfile(options, dest, 0, end_of_data)

    # Write the data file for this full backup
    record = '%s %s %s %s\n' % (dest, 0, end_of_data, digest)
    datfp = open(dest + '.dat', 'w')
    datfp.write(record)
    datfp.flush()
    os.fsync(datfp.fileno())
    datfp.close()

def do_incremental_backup(options, reposz):
    """Append the master's new transactions to the slave file.

    reposz is the master offset up to which the slave is already in sync.
    The bytes from reposz up to the end of the last complete transaction
    are appended to the slave file and a matching record is appended to
    its .dat file.  Sets options.full to False.  Exits with status 2 when
    the slave file does not exist.  Returns without writing anything when
    the master holds no complete transaction beyond reposz.
    """
    # Find the file position of the last completed transaction.
    file_path = os.path.join(options.master_dir, options.filename)
    # Per the FileStorage notes carried over from the original script: the
    # constructor scans the file and getSize() then reports the position
    # just after the last valid transaction record, so everything before
    # that position is safe to copy.
    fs = FileStorage(file_path, read_only=True)
    try:
        pos = fs.getSize()
    finally:
        fs.close()
    options.full = False

    dest = os.path.join(options.slave_dir, options.filename)

    if not os.path.exists(dest):
        sys.stderr.write('existing file not found %s\n' % (dest,))
        sys.exit(2)

    if pos == reposz:
        # The master file is larger than reposz only because of data
        # that is not (yet) a complete transaction.  Recording a
        # zero-length step would leave a .dat entry whose checksum covers
        # no bytes and therefore always matches, so later runs could no
        # longer notice that the master was rewritten.
        log('no complete new transaction after %s, nothing to do', reposz)
        return

    log('writing incremental: %s bytes to %s', pos - reposz, dest)
    digest = appendfile(options, dest, reposz, pos - reposz)

    datfile = dest + '.dat'

    # This .dat file better exist.  Let the exception percolate if not.
    fp = open(datfile, 'a')
    try:
        fp.write('%s %s %s %s\n' % (dest, reposz, pos, digest))
        fp.flush()
        os.fsync(fp.fileno())
    finally:
        fp.close()


def do_backup(options):
    """Bring the slave copy of the Data.fs up to date with the master.

    Chooses between a full copy and an incremental append based on the
    last record of the slave's .dat file:

    - no record, slave file missing, slave file shorter than recorded,
      master shorter than recorded, or checksum mismatch: full backup;
    - slave file longer than recorded: exit with status 3, or a full
      backup when options.force_backup is set;
    - checksum matches and the master has grown: incremental backup;
    - checksum matches and sizes are equal: nothing to do.
    """
    src_file_path = os.path.join(options.master_dir, options.filename)
    target_file_path = os.path.join(options.slave_dir, options.filename)
    srcsz = os.path.getsize(src_file_path)
    # An explicit branch instead of "exists and size or -1": that idiom
    # turns an existing but empty (size 0) slave file into "missing".
    if os.path.exists(target_file_path):
        targetsz = os.path.getsize(target_file_path)
    else:
        targetsz = -1
    fn, startpos, endpos, sum = scandat(options)

    # If the .dat file was missing, or was empty, do a full backup
    if (fn, startpos, endpos, sum) == (None, None, None, None):
        log('missing or empty .dat file (full backup)')
        do_full_backup(options)
        return

    # Has the source (master) file shrunk, possibly because of a pack?
    if srcsz < endpos:
        log('file shrunk, possibly because of a pack (doing full backup)')
        do_full_backup(options)
        return

    # Is the target (slave) file missing altogether?
    if targetsz == -1:
        log('No size for Data.fs: possibly missing? (doing full backup)')
        do_full_backup(options)
        return

    # Is the target (slave) file shorter than what the .dat file says was
    # written to it?  Appending at the recorded offset would then produce
    # a corrupt copy, so start over with a full backup.
    if targetsz < endpos:
        log('slave file shorter than last sync position '
            '(doing full backup)')
        do_full_backup(options)
        return

    # Has the target (slave) file grown?  This can happen if the slave
    # has temporarily been used and data has been written to it, so it
    # is no longer a plain copy of the master.
    if targetsz > endpos:
        if not options.force_backup:
            sys.stderr.write(
                'Slave (backup) file has grown since last syncing, '
                'possibly newer data has been written to this database?. '
                'Consider to backup the backup file, then delete it '
                'before the syncing will work' + '\n')
            sys.exit(3)
        sys.stderr.write(
            'Slave (backup) file has grown since last syncing, '
            'possibly newer data has been written to this database?. '
            'Since the force-backup parameter was used, a full backup '
            'will be done, and overwrite the slave backup' + '\n')
        do_full_backup(options)
        return

    # Now check the md5 sum of the source file, from the last
    # incremental's start and stop positions.
    srcfp = open(src_file_path, 'rb')
    try:
        srcfp.seek(startpos)
        srcsum = checksum(srcfp, endpos - startpos)
    finally:
        srcfp.close()
    log('last incremental file: %s', fn)
    log('last incremental checksum: %s', sum)
    log('source checksum range: [%s..%s], sum: %s',
        startpos, endpos, srcsum)

    if sum == srcsum:
        if srcsz == endpos:
            log('No changes, nothing to do')
            return
        log('doing incremental, starting at: %s', endpos)
        do_incremental_backup(options, endpos)
        return

    # The checksums don't match, meaning the front of the source file has
    # changed.  We'll need to do a full backup in that case.
    log('file changed, possibly because of a pack (full backup)')
    do_full_backup(options)

def main():
    """Script entry point: parse the command line and run the sync."""
    do_backup(parseargs())


# Run the sync only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

