# Source code for egaia.egaia_derive

# -*- coding: utf-8 -*-
# Requirements:
# libavcodec-extra, inkscape, libreoffice, weasyprint
# libjpeg62
# wget
# wkhtmltopdf

import sys, os, subprocess, re
import shutil
from docopt import docopt
import pkg_resources
import codecs
import base64

from egaia_config import getConfig
import egaia_parsefn
import egaia_list
import egaia_meta
import utils
import egaia_root
import strings

# Do not create derivatives on low-spec systems, such as an appliance built
# on a Raspberry Pi. The switch is the `no_deriv` setting read from the
# `system` configuration below; it has to be set manually. If derivative
# creation is disabled here, synchronize the archive with another system
# (e.g., with rsync or btsync) and run the command there.
# NOTE: this check runs at import time, so merely importing this module on
# such a system terminates the interpreter via exit().
if getConfig('system', 'no_deriv', boolean=True) is True:
    exit('This command is disabled on the current system.')

# FIXME: move the getConfig() functions out of the main namespace
# External commands used by makeDeriv(); each value is looked up once, when
# the module is imported.
cmd_convert = getConfig('system', 'cmd_convert')
cmd_mogrify = getConfig('system', 'cmd_mogrify')
cmd_inkscape = getConfig('system', 'cmd_inkscape')
cmd_ffmpeg = getConfig('system', 'cmd_ffmpeg')
cmd_libreoffice = getConfig('system', 'cmd_libreoffice')
cmd_wget = getConfig('system', 'cmd_wget')
cmd_wkhtmltopdf = getConfig('system', 'cmd_wkhtmltopdf')
cmd_wkhtmltoimage = getConfig('system', 'cmd_wkhtmltoimage')

# Mapping of media types to the (lower-case, dot-less) file extensions that
# belong to them. findRules() looks an extension up here to pick the media
# type, which in turn selects the list of rules in `outputFormats`.
knownFormats = {
    'audio': ('mp3', 'wav', 'wma', 'ogg'),
    'doc': ('odt', 'odp', 'doc', 'docx', 'ppt', 'pptx'),
    'raster': ('bmp', 'gif', 'jpg', 'jpeg', 'png', 'psd', 'tiff', 'tif'),
    'vector': ('eps', 'svg'),
    # 'wmv' (Windows Media Video) was previously misspelled as 'mwv', so
    # .wmv files were never recognised as video.
    'video': ('avi', 'flv', 'mov', 'mpeg', 'wmv', 'mp4', 'webm', 'ogv', 'mkv'),
    'pdf': ('pdf',),
    'text': ('txt', 'rst', 'md'),
    'url': ('url',),
    'vclips': ('vclips',),
}

# Mapping of media types to processing rules.
# https://www.archivematica.org/wiki/Format_policies
#
# Each entry is an ordered list of (rule, output suffix) pairs. The order is
# significant: several rules in makeDeriv() read a derivative produced by an
# earlier rule of the same list (e.g. thumbnails made from the generated PDF,
# video thumbnails made from the h264 file of a clip directory).
#
# "med" and "thumb" are used as names for the derived image sizes so that
# they stay accessible to html writers even if the size settings change
# later. The video formats are hard-coded because a set of options can be
# offered in the video tag.
outputFormats = {
    'audio': [
        ('pf-wav', 'wav'),
        ('df-mp3', 'mp3'),
    ],
    'doc': [
        ('df-pdf', 'pdf'),
        ('df-html', 'html'),
        ('df-med-img', 'jpg'),
        ('df-thumb-img', 'jpg'),
    ],
    'raster': [
        ('pf-tiff', 'tiff'),
        ('df-med-img', 'jpg'),
        ('df-thumb-img', 'jpg'),
    ],
    'vector': [
        ('pf-vector', 'svg'),
        ('df-pdf', 'pdf'),
        ('df-med-img', 'png'),
        ('df-thumb-img', 'png'),
    ],
    'video': [
        ('pf-ffv1', 'mkv'),
        ('df-h264', 'mp4'),
        ('df-360p-vp9-400k', 'webm'),
        ('df-stills', 'dir'),
        ('df-contact-sheet', 'html'),
        ('df-thumb-vid', 'jpg'),
        ('df-med-img-vid', 'jpg'),
    ],
    'pdf': [
        ('df-pdf', 'pdf'),
        ('df-thumb-img', 'jpg'),
        ('df-med-img', 'jpg'),
    ],
    'text': [
        ('df-txt', 'txt'),
        ('df-html', 'html'),
    ],
    'url': [
        ('df-webarc', 'dir'),
        ('df-pdf', 'pdf'),
        ('df-screenshot', 'png'),
        ('df-med-img', 'png'),
        ('df-thumb-img', 'png'),
    ],
    'vclips': [
        ('df-concat-list', 'txt'),
        ('pf-ffv1', 'mkv'),
        ('df-h264', 'mp4'),
        ('df-360p-vp9-400k', 'webm'),
        ('df-stills', 'dir'),
        ('df-contact-sheet', 'html'),
        ('df-thumb-vid', 'jpg'),
        ('df-med-img-vid', 'jpg'),
    ],
}


def run(arguments):
    """Hand an external command (argument list) to ``utils.run``.

    The result of ``utils.run`` is passed back unchanged; callers in this
    module treat a result of ``False`` as failure. According to the original
    note on this wrapper, nothing is executed when the "dry-run" flag is
    set -- presumably that is handled inside ``utils.run``.
    """
    outcome = utils.run(arguments)
    return outcome


# File extensions (lower-case, with dot) treated as video clips when a clip
# directory is scanned.
_CLIP_EXTENSIONS = ('.avi', '.mov', '.mpeg', '.mp4', '.webm', '.ogv', '.mkv')


def _listClips(directory):
    """Return the video clips below `directory`, oldest mtime first.

    The directory is walked recursively; only files whose extension is in
    `_CLIP_EXTENSIONS` are returned.
    """
    filenames = []
    for root, dirs, files in os.walk(directory):
        for f in files:
            filenames.append(os.path.join(root, f))
    filenames.sort(key=os.path.getmtime)
    return [f for f in filenames
            if os.path.splitext(f)[1].lower() in _CLIP_EXTENSIONS]


def _readUrls(fn):
    """Return the lines of the URL list file `fn` (one URL per line)."""
    with open(fn, 'r') as urlList:
        return urlList.read().splitlines()


def _ffmpegInput(mtype, fn, outDir, basename, UUID):
    """Return ``(leading ffmpeg arguments, input path)`` for a video rule.

    For a clip directory ('vclips') the input is the concat list written by
    the 'df-concat-list' rule and ffmpeg is told to use the concat demuxer;
    for everything else the input is `fn` itself.
    """
    if mtype == 'vclips':
        concat_list = os.path.join(
            outDir, '%s.df-concat-list.%s.txt' % (basename, UUID))
        return ([cmd_ffmpeg, '-f', 'concat', '-segment_time_metadata', '1'],
                concat_list)
    return [cmd_ffmpeg], fn


def makeDeriv(rule, mtype, fn, UUID, tmpFile, outDir, outFile, basename, N):
    """Create a derivative. This function contains the processing rules and
    commands.

    Parameters:
        rule:     name of the processing rule (first item of a pair in
                  `outputFormats`), e.g. 'df-pdf' or 'pf-ffv1'.
        mtype:    media type of the original (a key of `knownFormats`).
        fn:       path of the original file (or clip directory for 'vclips').
        UUID:     identifier embedded in all derivative file names.
        tmpFile:  path the derivative is written to first; it is renamed to
                  `outFile` once the rule has succeeded.
        outDir:   directory that holds the derivatives of this item.
        outFile:  final path of the derivative. For 'df-stills' this is a
                  directory, and 'df-contact-sheet' writes to it directly.
        basename: name of the original without UUID and extension, used to
                  build the names of sibling derivatives.
        N:        zero-based index: the frame/page handed to ImageMagick as
                  ``file[N]``, or the line of the URL list to use.

    Returns:
        None in every case. A rule that is unknown, disabled, or whose
        command reports ``False`` simply leaves no output file behind.
    """

    print("processing %s" % fn)
    print("rule %s" % rule)

    # NOTE: string comparisons use ``==``; identity (``is``) only works by
    # accident of string interning.
    if rule == 'df-txt' and mtype == 'text':
        r = shutil.copyfile(fn, tmpFile)

    elif rule == 'df-html' and mtype == 'text':
        with codecs.open(fn, mode='r', encoding='utf-8') as input_file:
            text = input_file.read()
        html = utils.md2html(text)
        if html:
            with codecs.open(tmpFile, mode='w', encoding='utf-8') as html_file:
                html_file.write(html)
        r = True

    elif rule == 'df-html' and mtype == 'doc' and fn.endswith('.docx'):
        html = utils.docx2html(fn)
        if html:
            with codecs.open(tmpFile, mode='w', encoding='utf-8') as html_file:
                html_file.write(html)
        r = True

    elif rule == 'df-pdf' and mtype == 'pdf':
        r = shutil.copyfile(fn, tmpFile)

    elif rule == 'df-webarc':
        r = run([cmd_wget, '--input-file=%s' % fn, '--convert-links',
            '--page-requisites', '--span-hosts', '--adjust-extension',
            '--restrict-file-names=windows',
            '--directory-prefix=%s' % tmpFile])

    elif rule in ('pf-wav', 'df-mp3'):
        # ffmpeg picks the codec from the extension of tmpFile
        r = run([cmd_ffmpeg, '-i', fn, tmpFile])

    elif rule == 'df-pdf' and mtype == 'doc':
        print("converting with libreoffice...")
        r = run([cmd_libreoffice, '--headless', '--convert-to',
            'pdf', '--outdir', outDir, fn])

        # Rename, since libreoffice doesn't let us specify a name
        pdfFile = '%s.%s.pdf' % (basename, UUID)
        try:
            os.rename(os.path.join(outDir, pdfFile), tmpFile)
        except OSError:
            # this will fail if LibreOffice is already open
            print("Error converting to PDF. Is LibreOffice already open?")

    elif rule == 'df-pdf' and mtype == 'url':
        r = run([cmd_wkhtmltopdf, _readUrls(fn)[N], tmpFile])

    elif rule == 'df-screenshot' and mtype == 'url':
        # FIXME: Don't keep downloading the resource;
        # set a persistent cache file
        r = run([cmd_wkhtmltoimage, _readUrls(fn)[N], tmpFile])

    elif rule == 'df-med-img' and mtype == 'url':
        r = run([cmd_wkhtmltoimage, '--crop-h', '800', '--quality', '60',
                _readUrls(fn)[N], tmpFile])

    elif rule == 'df-med-img':
        if mtype in ('vector', 'doc'):
            # use the generated pdf, not the original document
            fn = os.path.join(outDir, '%s.df-pdf.%s.pdf' % (basename, UUID))
            print("creating thumbnail from %s..." % fn)
        r = run([cmd_convert, '-density', '300', '%s[%s]' % (fn, N),
            '-resize', '800x600>',
            '-background', 'white', '-alpha', 'remove', '-auto-orient',
            tmpFile])

    elif rule == 'df-thumb-img':
        if mtype in ('vector', 'doc'):
            # use the generated pdf, not the original document
            fn = os.path.join(outDir, '%s.df-pdf.%s.pdf' % (basename, UUID))
        elif mtype == 'url':
            # use the medium image made by the 'df-med-img' rule; it lives
            # in outDir like every other finished derivative
            fn = os.path.join(outDir,
                              '%s.df-med-img.%s.png' % (basename, UUID))

        r = run([cmd_convert, '%s[%s]' % (fn, N), '-resize', '320x320>',
            '-background', 'white', '-alpha', 'remove', '-auto-orient',
            tmpFile])

    elif rule == 'pf-tiff':
        r = run([cmd_convert, '-compress', 'none', '%s[%s]' % (fn, N),
            tmpFile])

    elif rule == 'pf-vector':
        r = run([cmd_inkscape, fn, '--export-plain-svg=%s' % tmpFile])

    elif rule == 'df-pdf' and mtype == 'vector':
        r = run([cmd_inkscape, fn, '--export-pdf=%s' % tmpFile])


    ## VIDEO RULES

    elif rule == 'df-concat-list':

        if not os.path.isdir(fn):
            print("%s is not a directory" % fn)
            return

        clips = []
        offsets = []
        offset = 0

        for filename in _listClips(fn):
            try:
                duration = egaia_meta.getDuration(filename)
            except Exception:
                # best effort: leave unreadable clips out of the list
                print("Error probing %s; skipping!" % filename)
                continue

            clip_path = os.path.relpath(filename, os.path.dirname(fn))
            clips.append(u'file %s' % clip_path)
            clips.append(u'duration %s\n' % duration)

            # start time of this clip within the concatenated video
            offsets.append(u','.join([clip_path, str(offset)]))
            offset = offset + float(duration)

        if not clips:
            print("No video files to add!")
            return

        # write the ffconcat script
        with codecs.open(tmpFile, mode='w', encoding='utf-8') as out:
            out.write('ffconcat version 1.0\n\n' + '\n'.join(clips))

        # write the offsets database
        db = os.path.join(outDir, '%s.df-concat-offsets.%s.csv' % (basename, UUID))

        with codecs.open(db, mode='w', encoding='utf-8') as out:
            out.write('\n'.join(offsets))

        r = True

    elif rule == 'df-h264':

        cmd_args, fn = _ffmpegInput(mtype, fn, outDir, basename, UUID)

        r = run(cmd_args + [
                '-i', fn,
                '-vcodec', 'libx264',
                '-acodec', 'aac',
                '-ab', '384K',
                '-crf', '21',
                '-bf', '2',
                '-flags', '+cgop',
                '-pix_fmt', 'yuv420p',
                '-movflags', 'faststart',
                '-threads', getConfig('system', 'cores'),
                tmpFile])

    elif rule == 'df-360p-vp9-400k':

        cmd_args, fn = _ffmpegInput(mtype, fn, outDir, basename, UUID)

        # use tmpFile instead of /dev/null or NUL to avoid prompts
        # to overwrite, etc.
        # http://wiki.webmproject.org/ffmpeg/vp9-encoding-guide

        run(cmd_args + [
                '-i', fn,
                '-codec:v', 'libvpx-vp9',
                '-pass', '1',
                '-b:v', '400K',
                '-crf', '33',
                '-threads', getConfig('system', 'cores'),
                '-speed', '4',
                '-tile-columns', '6',
                '-frame-parallel', '1',
                '-vf', 'scale=-1:360',
                '-an',
                '-f', 'webm',
                tmpFile])

        # The first pass only matters for its log file; drop its video
        # output if there is one (there is none when the pass did not run
        # or failed).
        if os.path.exists(tmpFile):
            os.unlink(tmpFile)

        r = run(cmd_args + [
                '-i', fn,
                '-codec:v', 'libvpx-vp9',
                '-pass', '2',
                '-b:v', '400K',
                '-crf', '33',
                '-threads', getConfig('system', 'cores'),
                '-speed', '1',
                '-tile-columns', '6',
                '-frame-parallel', '1',
                '-vf', 'scale=-1:360',
                '-auto-alt-ref', '1',
                '-lag-in-frames', '25',
                '-c:a', 'libopus',
                '-b:a', '64k',
                '-f', 'webm',
                tmpFile])

        # Remove the ffmpeg passlogfile here (searched for below the
        # current working directory)
        passlog = re.compile("^ffmpeg2pass.*$")
        for root, dirs, files in os.walk(os.getcwd()):
            for name in files:
                if passlog.match(name):
                    os.remove(os.path.join(root, name))

    elif rule == 'pf-ffv1':

        # We need to enable ffv1 conversion explicitly
        if getConfig('system', 'ffv1', boolean=True) is False:
            return

        cmd_args, fn = _ffmpegInput(mtype, fn, outDir, basename, UUID)

        # FIXME: Improve compression settings to get smaller files,
        # or else disable conversion from open standard formats?

        r = run(cmd_args + [
                '-i', fn,
                '-vcodec', 'ffv1',
                '-acodec', 'pcm_s16le',
                '-threads', getConfig('system', 'cores'),
                tmpFile])

    elif rule in ('df-med-img-vid', 'df-thumb-vid'):

        if mtype == 'vclips':
            # derive thumb from the concatenated h264 clip
            fn = os.path.join(outDir, '%s.df-h264.%s.mp4' % (basename, UUID))

        # copy the main thumbnail image from halfway in
        seek_time = float(egaia_meta.getDuration(fn)) // 2

        if rule == 'df-med-img-vid':
            scale = 'scale=800:-1'
        else:
            scale = 'scale=320:-1'

        # put -ss TIME *before* the input to use keyframe seeking (fast!)
        r = run([cmd_ffmpeg, '-ss', '%s' % int(seek_time), '-i', fn,
            '-frames:v', '1', '-vf', scale, tmpFile])

    elif rule == 'df-stills':

        # here `outFile` is a directory path, not a file
        if os.path.isdir(outFile):
            # we need to delete the directory if the ``--force`` flag is given
            shutil.rmtree(outFile)
        os.mkdir(outFile)

        if mtype == 'vclips':
            clips = _listClips(fn)
        else:
            clips = [fn]

        # one still every six seconds, named <clip no.>.<clip name>.<frame>.jpg
        for n, c in enumerate(clips):
            print(c)
            run([cmd_ffmpeg,
                 '-i', c,
                 '-vf', 'fps=1/6',
                 os.path.join(outFile,
                    '%03d.%s.%%05d.jpg' % (n, egaia_parsefn.getBasename(c)))
                 ])

        # resize the images to something sensible
        r = run([cmd_mogrify, '-units', 'PixelsPerInch', '-density',
                    '300', '-resize', '240x180>',
                    os.path.join(outFile, '*.jpg')])

        for image in sorted(os.listdir(outFile)):

            # [clip no., basename parts..., frame no., ext]
            image_parts = image.split('.')

            clipname = '.'.join(image_parts[1:-2])
            # image numbering starts at 1; time starts at 0
            frame_no = max(int(image_parts[-2]) - 1, 0)
            secs = frame_no * 6

            # rename to include second count rather than timestamp
            new_image = '%s.df-still.%05d.%s.jpg' % (clipname, secs, UUID)
            os.rename(os.path.join(outFile, image), os.path.join(outFile, new_image))

    elif rule == 'df-contact-sheet':

        # get the list of images
        stills_dir = os.path.join(outDir, '%s.df-stills.%s.dir' % (basename, UUID))
        if not os.path.isdir(stills_dir):
            print("ERROR: %s is not a directory or does not exist" % stills_dir)
            return
        images = sorted(os.listdir(stills_dir))

        # create html page.
        # Write directly so as to limit memory use -- is this an issue with
        # smaller thumbnails?
        with open(outFile, 'w') as html:
            html.write(u'<!DOCTYPE HTML>')
            html.write(u'<html><head></head>')
            html.write(u'<body>')
            html.write(u'<h1>%s</h1>' % UUID)

            for idx, image in enumerate(images):

                # [basename parts..., 'df-still', seconds, uuid, ext]
                image_parts = image.split('.')
                clipname = '.'.join(image_parts[0:-4])
                secs = int(image_parts[-3])

                t = utils.fmtTime(secs)

                if secs == 0:
                    # first thumb. idx won't work in clip sequences.
                    in_point = '0'
                    out_point = '3'
                elif idx == len(images) - 1:
                    # last thumb
                    in_point = str(secs-3)
                    out_point = str(secs)
                else:
                    in_point = str(secs-3)
                    out_point = str(secs+3)

                cb_txt = u' '.join([UUID, clipname, in_point, out_point])

                img_fn = os.path.join(stills_dir, image)
                with open(img_fn, "rb") as img:
                    encoded_img = base64.b64encode(img.read())
                # FIXME: Don't overshoot the last image in concatenated sequences
                html.write(u' '.join([
                            u'\n<img src="data:image/jpeg;base64,%s"' % encoded_img,
                            u'title="%s %s"' % (clipname, t),
                            u'class="btn"',
                            u'data-clipboard-text="%s">' % cb_txt
                                        ]))

            html.write(u'<script type="text/javascript">')
            html.write(strings.clipboard_js)
            html.write(strings.clipboard_js_call)
            html.write(u'</script>')
            html.write(u'</body></html>')

        r = True

    else:
        # programming error?
        return

    if r is False:
        return

    if os.path.exists(tmpFile):
        # move temporary file to the working directory
        os.rename(tmpFile, outFile)
        print("Converted to %s" % outFile)

    return


def process(uuid=None, frame='0', force=False, update=False):
    """Collect the originals to work on and derive each of them.

    Parameters:
        uuid:   when given (truthy), the file listing is restricted to this
                item; the directory listing is not filtered by it.
        frame:  passed through unchanged to findRules().
        force:  passed through unchanged to findRules().
        update: passed through unchanged to findRules().

    Prints a notice and returns when there is nothing to work on.
    """
    listing_args = {'filter_type': 'originals'}
    if uuid:
        listing_args['uuid'] = uuid

    originals = egaia_list.listFiles(**listing_args)
    # clip directories are originals as well
    originals = originals + egaia_list.listDirs(filter_type='originals')

    if not originals:
        print("Nothing to process!")
        return

    for original in originals:
        findRules(original, frame, force, update)

def findRules(fn, frame, force, update):
    """Apply every processing rule that matches the file `fn`.

    The media type is looked up from the file extension in `knownFormats`;
    each (rule, suffix) pair listed for that type in `outputFormats` is then
    handed to makeDeriv(), unless its output already exists and neither
    `force` nor an outdated modification time (`update`) asks for a rebuild.

    Parameters:
        fn:     path of the original.
        frame:  index passed on to makeDeriv() as its last argument.
        force:  rebuild derivatives that already exist.
        update: rebuild existing derivatives that are older than `fn`.
    """
    basename, UUID, extension = egaia_parsefn.parseFilename(fn)

    if UUID is None:
        # don't automatically add uuid if missing; this should be done manually
        print("No uuid tag for this file; ignoring")
        return

    wanted = extension.lower()
    mtype = next(
        (kind for kind, known in knownFormats.items() if wanted in known),
        None)

    if not mtype:
        print('No conversion rule for this file; ignoring')
        return

    # If items are in subdirectories, we want to keep those.
    outDir = os.path.dirname(fn)

    for rule, suffix in outputFormats[mtype]:
        outFile = os.path.join(outDir, '.'.join([basename, rule, UUID, suffix]))

        # FIXME: This is not a very robust check -- we may have a derivative
        # that has been moved to a different directory.
        if os.path.exists(outFile):
            if update:
                # leave the derivative alone when it is as new as, or newer
                # than, its source
                if os.stat(fn).st_mtime <= os.stat(outFile).st_mtime:
                    continue
            elif not force:
                print('%s already exists' % outFile)
                continue

        # The work file carries a "tmp" part before the extension:
        # <FILENAME>.tmp.<EXT>. It can safely be deleted if the conversion
        # fails. A prefix cannot be used because fn might include directory
        # paths.
        tmpFile = '.'.join([basename, rule, UUID, 'tmp', suffix])

        makeDeriv(rule, mtype, fn, UUID, tmpFile, outDir, outFile, basename, frame)
    

def _cli(args):
    """egaia derive

    Generate derivative formats for archival storage and for distribution.

    Usage:
        egaia derive --help
        egaia derive [ --item=ITEM ] [ --frame=N ] [ --force | --update ]
    
    """
    # NOTE: the docstring above is docopt-style usage text (presumably parsed
    # by the caller that builds `args`), so it is program text: keep it as is.
    # `args` is read as a mapping with the keys '--item', '--frame',
    # '--force' and '--update'.

    if not egaia_root.get_root():
        exit('Please run "egaia init" to create a new collection here.')

    frame = args['--frame']

    if frame is None:
        frame = '1'

    # use input numbering that starts with 1
    frame = int(frame) - 1

    # Resolve the item to a uuid only when one was given; process() treats
    # a missing uuid as "all originals".
    item = args['--item']
    uuid = egaia_parsefn.getUuid(item) if item else None

    # Pass --update in both cases: the usage line allows it together with
    # --item, and it used to be dropped silently there.
    process(uuid=uuid, frame=frame, force=args['--force'],
            update=args['--update'])
    return