# Source code for egaia.egaia_derive

# -*- coding: utf-8 -*-
# Requirements:
# libavcodec-extra, inkscape, libreoffice, weasyprint
# libjpeg62
# wget
# wkhtmltopdf

import sys, os, subprocess, re
import shutil
from docopt import docopt
import pkg_resources
import codecs
import base64

from egaia_config import getConfig
import egaia_parsefn
import egaia_list
import egaia_meta
import utils
import egaia_root
import strings

# Do not create derivatives on low-spec systems, such as an appliance built
# on Raspberry Pi. The 'no_deriv' setting needs to be set manually in the
# configuration file. If derivatives creation is disabled here, we should
# synchronize the archive with another system (e.g., with rsync or btsync) and
# run the command there.
#
# sys.exit() is used instead of the bare exit() helper: the latter is injected
# by the `site` module for interactive use and is not guaranteed to exist
# (e.g. under `python -S` or in frozen builds).
if getConfig('system', 'no_deriv', boolean=True) is True:
    sys.exit('This command is disabled on the current system.')

# Paths/names of the external converters, read once at import time from the
# 'system' section of the configuration.
# FIXME: move the getConfig() functions out of the main namespace
cmd_convert = getConfig('system', 'cmd_convert')
cmd_mogrify = getConfig('system', 'cmd_mogrify')
cmd_inkscape = getConfig('system', 'cmd_inkscape')
cmd_ffmpeg = getConfig('system', 'cmd_ffmpeg')
cmd_libreoffice = getConfig('system', 'cmd_libreoffice')
cmd_wget = getConfig('system', 'cmd_wget')
cmd_wkhtmltopdf = getConfig('system', 'cmd_wkhtmltopdf')
cmd_wkhtmltoimage = getConfig('system', 'cmd_wkhtmltoimage')

# Mapping of media types to the lower-case file extensions (without the
# leading dot) that are recognised for each type. findRules() lower-cases a
# file's extension and looks it up here to pick the media type.
knownFormats = {
    'audio': ('mp3', 'wav', 'wma', 'ogg'),
    'doc': ('odt', 'odp', 'doc', 'docx', 'ppt', 'pptx'),
    'raster': ('bmp', 'gif', 'jpg', 'jpeg', 'png', 'psd', 'tiff', 'tif'),
    'vector': ('eps', 'svg'),
    # NOTE(review): 'mwv' looks like a historical typo for 'wmv' (Windows
    # Media Video). 'wmv' is now recognised; 'mwv' is kept so that nothing
    # that was matched before stops matching.
    'video': ('avi', 'flv', 'mov', 'mpeg', 'mwv', 'wmv', 'mp4', 'webm',
              'ogv', 'mkv'),
    'pdf': ('pdf',),
    'text': ('txt', 'rst', 'md'),
    'url': ('url',),
    'vclips': ('vclips',),
}

# Processing rules for each media type.
# https://www.archivematica.org/wiki/Format_policies
#
# Every entry is a (rule, suffix) pair. findRules() walks the list in order,
# names the output "<basename>.<rule>.<UUID>.<suffix>" and passes the rule to
# makeDeriv(). The order matters: some rules read the output of an earlier
# one (e.g. 'df-med-img' for 'doc'/'vector' is rendered from the 'df-pdf'
# file, and the 'vclips' video rules read the 'df-concat-list' file).
#
# "med" and "thumb" are used as derived image size names to keep them
# accessible to html writers, even if the size settings are updated later.
# The video formats are hard-coded because a set of options can be offered
# in the video tag.
outputFormats = {
    'audio': [
        ('pf-wav', 'wav'),
        ('df-mp3', 'mp3'),
    ],
    'doc': [
        ('df-pdf', 'pdf'),
        ('df-html', 'html'),
        ('df-med-img', 'jpg'),
        ('df-thumb-img', 'jpg'),
    ],
    'raster': [
        ('pf-tiff', 'tiff'),
        ('df-med-img', 'jpg'),
        ('df-thumb-img', 'jpg'),
    ],
    'vector': [
        ('pf-vector', 'svg'),
        ('df-pdf', 'pdf'),
        ('df-med-img', 'png'),
        ('df-thumb-img', 'png'),
    ],
    'video': [
        ('pf-ffv1', 'mkv'),
        ('df-h264', 'mp4'),
        ('df-360p-vp9-400k', 'webm'),
        ('df-stills', 'dir'),
        ('df-contact-sheet', 'html'),
        ('df-thumb-vid', 'jpg'),
        ('df-med-img-vid', 'jpg'),
    ],
    'pdf': [
        ('df-pdf', 'pdf'),
        ('df-thumb-img', 'jpg'),
        ('df-med-img', 'jpg'),
    ],
    'text': [
        ('df-txt', 'txt'),
        ('df-html', 'html'),
    ],
    'url': [
        ('df-webarc', 'dir'),
        ('df-pdf', 'pdf'),
        ('df-screenshot', 'png'),
        ('df-med-img', 'png'),
        ('df-thumb-img', 'png'),
    ],
}

# A directory of clips gets the same derivatives as a single video, preceded
# by the ffconcat list that the video rules use as their input.
outputFormats['vclips'] = [('df-concat-list', 'txt')] + outputFormats['video']


def run(arguments):
    """Execute an external command through utils.run().

    `arguments` is handed to utils.run() unchanged and whatever that call
    returns is passed back to the caller. Nothing is executed when the
    "dry-run" flag is set (presumably enforced inside utils.run(); it is not
    checked here).
    """
    outcome = utils.run(arguments)
    return outcome
# Extensions (lower-case, with dot) treated as video clips inside a clip
# directory.
_CLIP_EXTENSIONS = ('.avi', '.mov', '.mpeg', '.mp4', '.webm', '.ogv', '.mkv')


def _read_urls(fn):
    """Return the lines of the URL list file `fn`."""
    with open(fn, 'r') as url_list:
        return url_list.read().splitlines()


def _write_html(html, tmpFile):
    """Write `html` to `tmpFile` as UTF-8; return False if there is no html."""
    if not html:
        return False
    with codecs.open(tmpFile, mode='w', encoding='utf-8') as html_file:
        html_file.write(html)
    return True


def _list_video_clips(directory):
    """Return the video files found below `directory`, oldest mtime first."""
    filenames = []
    for root, dirs, files in os.walk(directory):
        for f in files:
            filenames.append(os.path.join(root, f))
    filenames.sort(key=os.path.getmtime)
    return [name for name in filenames
            if os.path.splitext(name)[1].lower() in _CLIP_EXTENSIONS]


def _ffmpeg_input(mtype, fn, UUID, outDir, basename):
    """Return (ffmpeg argument prefix, input path) for a video encoding rule.

    For 'vclips' the input is the previously written ffconcat list, read
    with ffmpeg's concat demuxer.
    """
    cmd_args = [cmd_ffmpeg]
    if mtype == 'vclips':
        # use the clips concat list as input
        fn = os.path.join(outDir,
                          '%s.df-concat-list.%s.txt' % (basename, UUID))
        cmd_args = cmd_args + ['-f', 'concat', '-segment_time_metadata', '1']
    return cmd_args, fn


def _grab_middle_frame(mtype, fn, UUID, tmpFile, outDir, basename, width):
    """Save the frame halfway into the video as an image `width` pixels wide."""
    if mtype == 'vclips':
        # derive thumb from the concatenated h264 clip
        fn = os.path.join(outDir, '%s.df-h264.%s.mp4' % (basename, UUID))
    # copy the main thumbnail image from halfway in
    seek_time = float(egaia_meta.getDuration(fn)) // 2
    # put -ss TIME *before* the input to use keyframe seeking (fast!)
    return run([cmd_ffmpeg, '-ss', '%s' % int(seek_time), '-i', fn,
                '-frames:v', '1', '-vf', 'scale=%s:-1' % width, tmpFile])


def _make_concat_list(fn, UUID, tmpFile, outDir, basename):
    """Write the ffconcat script and the clip offsets CSV for directory `fn`.

    Returns True on success, False if `fn` is not a directory or holds no
    usable video clips.
    """
    if not os.path.isdir(fn):
        print("%s is not a directory" % fn)
        return False

    offsets = []
    offset = 0
    clips = []
    for filename in _list_video_clips(fn):
        try:
            duration = egaia_meta.getDuration(filename)
        except Exception:
            print("Error probing %s; skipping!" % filename)
            continue
        clip_path = os.path.relpath(filename, os.path.dirname(fn))
        clips.append(u'file %s' % clip_path)
        clips.append(u'duration %s\n' % duration)
        # start time of this clip within the concatenated video
        offsets.append(u','.join([clip_path, str(offset)]))
        offset = offset + float(duration)

    if not clips:
        print("No video files to add!")
        return False

    # write the ffconcat script
    with codecs.open(tmpFile, mode='w', encoding='utf-8') as out:
        out.write('ffconcat version 1.0\n\n' + '\n'.join(clips))
    # write the offsets database
    db = os.path.join(outDir,
                      '%s.df-concat-offsets.%s.csv' % (basename, UUID))
    with codecs.open(db, mode='w', encoding='utf-8') as out:
        out.write('\n'.join(offsets))
    return True


def _make_stills(mtype, fn, UUID, outFile):
    """Extract one still every 6 seconds into the directory `outFile`.

    Returns the result of the final resize command.
    """
    # here `outFile` is a directory path, not a file
    if os.path.isdir(outFile):
        # we need to delete the directory if the ``--force`` flag is given
        shutil.rmtree(outFile)
    os.mkdir(outFile)

    if mtype == 'vclips':
        clips = _list_video_clips(fn)
    else:
        clips = [fn]

    for n, clip in enumerate(clips):
        print(clip)
        # a failed clip is skipped; the remaining clips are still processed
        run([cmd_ffmpeg, '-i', clip, '-vf', 'fps=1/6',
             os.path.join(outFile, '%03d.%s.%%05d.jpg'
                          % (n, egaia_parsefn.getBasename(clip)))])

    # resize the images to something sensible (once, for all clips)
    r = run([cmd_mogrify, '-units', 'PixelsPerInch', '-density', '300',
             '-resize', '240x180>', os.path.join(outFile, '*.jpg')])

    for image in sorted(os.listdir(outFile)):
        # [clip no., basename parts..., frame no., ext]
        image_parts = image.split('.')
        clipname = '.'.join(image_parts[1:-2])
        # image numbering starts at 1; time starts at 0
        frame_no = max(int(image_parts[-2]) - 1, 0)
        secs = frame_no * 6
        # rename to include second count rather than frame number
        new_image = '%s.df-still.%05d.%s.jpg' % (clipname, secs, UUID)
        os.rename(os.path.join(outFile, image),
                  os.path.join(outFile, new_image))
    return r


def _make_contact_sheet(UUID, outDir, outFile, basename):
    """Write an HTML page embedding every still as a base64 image.

    Returns True on success, False if the stills directory is missing.
    """
    stills_dir = os.path.join(outDir,
                              '%s.df-stills.%s.dir' % (basename, UUID))
    if not os.path.isdir(stills_dir):
        print("ERROR: %s is not a directory or does not exist" % stills_dir)
        return False
    images = sorted(os.listdir(stills_dir))

    # Write directly so as to limit memory use -- is this an issue with
    # smaller thumbnails?
    with open(outFile, 'w') as html:
        html.write(u'<!DOCTYPE HTML>')
        html.write(u'<html><head></head>')
        html.write(u'<body>')
        html.write(u'<h1>%s</h1>' % UUID)
        for idx, image in enumerate(images):
            # [basename parts..., 'df-still', seconds, UUID, ext]
            image_parts = image.split('.')
            clipname = '.'.join(image_parts[0:-4])
            secs = int(image_parts[-3])
            t = utils.fmtTime(secs)
            if secs == 0:
                # first thumb. idx won't work in clip sequences.
                in_point = '0'
                out_point = '3'
            elif idx == len(images) - 1:
                # last thumb
                in_point = str(secs - 3)
                out_point = str(secs)
            else:
                in_point = str(secs - 3)
                out_point = str(secs + 3)
            cb_txt = u' '.join([UUID, clipname, in_point, out_point])
            img_fn = os.path.join(stills_dir, image)
            with open(img_fn, 'rb') as img:
                # decode so the text (not a bytes repr) is embedded
                encoded_img = base64.b64encode(img.read()).decode('ascii')
            # FIXME: Don't overshoot the last image in concatenated sequences
            html.write(u' '.join([
                u'\n<img src="data:image/jpeg;base64,%s"' % encoded_img,
                u'title="%s %s"' % (clipname, t),
                u'class="btn"',
                u'data-clipboard-text="%s">' % cb_txt
            ]))
        html.write(u'<script type="text/javascript">')
        html.write(strings.clipboard_js)
        html.write(strings.clipboard_js_call)
        html.write(u'</script>')
        html.write(u'</body></html>')
    return True


def makeDeriv(rule, mtype, fn, UUID, tmpFile, outDir, outFile, basename, N):
    """Create a derivative.

    This function contains the processing rules and commands. Most rules
    write to `tmpFile`, which is renamed to `outFile` once the conversion has
    succeeded; 'df-stills' and 'df-contact-sheet' write to `outFile` directly.

    :param rule: name of the processing rule (first item of an
        ``outputFormats`` entry), e.g. ``'df-pdf'``.
    :param mtype: media type of the source (a key of ``knownFormats``).
    :param fn: path of the source file, or clip directory for 'vclips'.
    :param UUID: identifier embedded in the generated file names.
    :param tmpFile: temporary output path.
    :param outDir: directory that holds the derivatives of this item.
    :param outFile: final output path (a directory for 'df-stills').
    :param basename: file name stem used to build the names of sibling
        derivatives (``<basename>.<rule>.<UUID>.<suffix>``).
    :param N: zero-based index; selects the frame/page passed to ImageMagick
        as ``fn[N]`` and the line of the URL list for 'url' sources.
    :returns: always ``None``; failures are reported on stdout.
    """
    print("processing %s" % fn)
    print("rule %s" % rule)

    # NOTE: the order of the branches matters -- the media-type specific
    # variants of a rule must be tested before the generic one.
    if rule == 'df-txt' and mtype in ('text',):
        r = shutil.copyfile(fn, tmpFile)

    elif rule == 'df-html' and mtype == 'text':
        with codecs.open(fn, mode='r', encoding='utf-8') as input_file:
            text = input_file.read()
        r = _write_html(utils.md2html(text), tmpFile)

    elif rule == 'df-html' and mtype == 'doc' and fn.endswith('.docx'):
        r = _write_html(utils.docx2html(fn), tmpFile)

    elif rule == 'df-pdf' and mtype == 'pdf':
        r = shutil.copyfile(fn, tmpFile)

    elif rule == 'df-webarc':
        r = run([cmd_wget,
                 '--input-file=%s' % fn,
                 '--convert-links',
                 '--page-requisites',
                 '--span-hosts',
                 '--adjust-extension',
                 '--restrict-file-names=windows',
                 '--directory-prefix=%s' % tmpFile])

    elif rule in ('pf-wav', 'df-mp3'):
        # ffmpeg picks the codec from the extension of tmpFile
        r = run([cmd_ffmpeg, '-i', fn, tmpFile])

    elif rule == 'df-pdf' and mtype == 'doc':
        print("converting with libreoffice...")
        r = run([cmd_libreoffice, '--headless', '--convert-to', 'pdf',
                 '--outdir', outDir, fn])
        # Rename, since libreoffice doesn't let us specify a name
        pdfFile = '%s.%s.pdf' % (basename, UUID)
        try:
            os.rename(os.path.join(outDir, pdfFile), tmpFile)
        except OSError:
            # this will fail if LibreOffice is already open
            print("Error converting to PDF. Is LibreOffice already open?")

    elif rule == 'df-pdf' and mtype == 'url':
        r = run([cmd_wkhtmltopdf, _read_urls(fn)[N], tmpFile])

    elif rule == 'df-screenshot' and mtype == 'url':
        # FIXME: Don't keep downloading the resource;
        # set a persistent cache file
        r = run([cmd_wkhtmltoimage, _read_urls(fn)[N], tmpFile])

    elif rule == 'df-med-img' and mtype == 'url':
        r = run([cmd_wkhtmltoimage, '--crop-h', '800', '--quality', '60',
                 _read_urls(fn)[N], tmpFile])

    elif rule == 'df-med-img':
        if mtype in ('vector', 'doc'):
            # use the generated pdf, not the original document
            fn = os.path.join(outDir, '%s.df-pdf.%s.pdf' % (basename, UUID))
        print("creating thumbnail from %s..." % fn)
        r = run([cmd_convert, '-density', '300',
                 '%s[%s]' % (fn, N), '-resize', '800x600>',
                 '-background', 'white', '-alpha', 'remove', '-auto-orient',
                 tmpFile])

    elif rule == 'df-thumb-img':
        if mtype in ('vector', 'doc'):
            # use the generated pdf, not the original document
            fn = os.path.join(outDir, '%s.df-pdf.%s.pdf' % (basename, UUID))
        elif mtype in ('url',):
            # NOTE(review): unlike the branch above this path is not joined
            # with outDir -- confirm that `basename` carries the directory.
            fn = '%s.df-med-img.%s.png' % (basename, UUID)
        r = run([cmd_convert,
                 '%s[%s]' % (fn, N), '-resize', '320x320>',
                 '-background', 'white', '-alpha', 'remove', '-auto-orient',
                 tmpFile])

    elif rule == 'pf-tiff':
        r = run([cmd_convert, '-compress', 'none',
                 '%s[%s]' % (fn, N), tmpFile])

    elif rule == 'pf-vector':
        r = run([cmd_inkscape, fn, '--export-plain-svg=%s' % tmpFile])

    elif rule == 'df-pdf' and mtype == 'vector':
        r = run([cmd_inkscape, fn, '--export-pdf=%s' % tmpFile])

    ## VIDEO RULES

    elif rule == 'df-concat-list':
        r = _make_concat_list(fn, UUID, tmpFile, outDir, basename)

    elif rule == 'df-h264':
        cmd_args, fn = _ffmpeg_input(mtype, fn, UUID, outDir, basename)
        r = run(cmd_args + [
            '-i', fn, '-vcodec', 'libx264',
            '-acodec', 'aac', '-ab', '384K',
            '-crf', '21', '-bf', '2', '-flags', '+cgop',
            '-pix_fmt', 'yuv420p', '-movflags', 'faststart',
            '-threads', getConfig('system', 'cores'),
            tmpFile])

    elif rule == 'df-360p-vp9-400k':
        cmd_args, fn = _ffmpeg_input(mtype, fn, UUID, outDir, basename)
        # use tmpFile instead of /dev/null or NUL to avoid prompts
        # to overwrite, etc.
        # http://wiki.webmproject.org/ffmpeg/vp9-encoding-guide
        r = run(cmd_args + [
            '-i', fn, '-codec:v', 'libvpx-vp9',
            '-pass', '1', '-b:v', '400K', '-crf', '33',
            '-threads', getConfig('system', 'cores'),
            '-speed', '4', '-tile-columns', '6', '-frame-parallel', '1',
            '-vf', 'scale=-1:360',
            '-an', '-f', 'webm', tmpFile])
        # the first pass may not have produced a file (failure, dry run)
        if os.path.exists(tmpFile):
            os.unlink(tmpFile)
        if r is not False:
            # the second pass needs the log written by the first one
            r = run(cmd_args + [
                '-i', fn, '-codec:v', 'libvpx-vp9',
                '-pass', '2', '-b:v', '400K', '-crf', '33',
                '-threads', getConfig('system', 'cores'),
                '-speed', '1', '-tile-columns', '6', '-frame-parallel', '1',
                '-vf', 'scale=-1:360',
                '-auto-alt-ref', '1', '-lag-in-frames', '25',
                '-c:a', 'libopus', '-b:a', '64k', '-f', 'webm', tmpFile])
        # Remove the ffmpeg passlogfile here
        pattern = r"^ffmpeg2pass.*$"
        for root, dirs, files in os.walk(os.getcwd()):
            for name in files:
                if re.match(pattern, name):
                    os.remove(os.path.join(root, name))

    elif rule == 'pf-ffv1':
        # We need to enable ffv1 conversion explicitly
        if getConfig('system', 'ffv1', boolean=True) is False:
            return
        cmd_args, fn = _ffmpeg_input(mtype, fn, UUID, outDir, basename)
        # FIXME: Improve compression settings to get smaller files,
        # or else disable conversion from open standard formats?
        r = run(cmd_args + [
            '-i', fn, '-vcodec', 'ffv1',
            '-acodec', 'pcm_s16le',
            '-threads', getConfig('system', 'cores'),
            tmpFile])

    elif rule == 'df-med-img-vid':
        r = _grab_middle_frame(mtype, fn, UUID, tmpFile, outDir, basename,
                               '800')

    elif rule == 'df-thumb-vid':
        r = _grab_middle_frame(mtype, fn, UUID, tmpFile, outDir, basename,
                               '320')

    elif rule == 'df-stills':
        r = _make_stills(mtype, fn, UUID, outFile)

    elif rule == 'df-contact-sheet':
        r = _make_contact_sheet(UUID, outDir, outFile, basename)

    else:
        # programming error?
        return

    if r is False:
        return

    if os.path.exists(tmpFile):
        # move temporary file to the working directory
        os.rename(tmpFile, outFile)
        print("Converted to %s" % outFile)
    return
def process(uuid=None, frame='0', force=False, update=False):
    """Generate a list of files to process and derive each of them.

    :param uuid: if given, only the original files listed for this uuid are
        processed; otherwise all original files and directories are.
    :param frame: zero-based frame/page/URL index handed on to the rules.
        It is accepted as a string or an int and normalised to an int here,
        because makeDeriv() also uses it as a list index.
    :param force: regenerate derivatives that already exist.
    :param update: regenerate derivatives that are older than their source.
    """
    frame = int(frame)

    if uuid:
        filenames = egaia_list.listFiles(filter_type='originals', uuid=uuid)
    else:
        filenames = egaia_list.listFiles(filter_type='originals')
        filenames = filenames + egaia_list.listDirs(filter_type='originals')

    if not filenames:
        print("Nothing to process!")
        return

    for fn in filenames:
        findRules(fn, frame, force, update)
    return
def findRules(fn, frame, force, update):
    """Work out which derivatives `fn` needs and create them.

    The file name is parsed into basename, uuid and extension; the extension
    selects a media type from ``knownFormats`` and every (rule, suffix) pair
    listed for that type in ``outputFormats`` is passed to makeDeriv().
    Files without a uuid or with an unknown extension are ignored.

    An existing output file is skipped unless `force` is set, or `update` is
    set and the source is newer than the output.
    """
    basename, UUID, extension = egaia_parsefn.parseFilename(fn)
    if UUID is None:
        # don't automatically add uuid if missing; this should be done manually
        print("No uuid tag for this file; ignoring")
        return

    ext = extension.lower()
    mtype = next(
        (fileFormat for fileFormat, extensions in knownFormats.items()
         if ext in extensions),
        None)
    if not mtype:
        print('No conversion rule for this file; ignoring')
        return

    # If items are in subdirectories, we want to keep those.
    outDir = os.path.dirname(fn)

    for rule, suffix in outputFormats[mtype]:
        outFile = os.path.join(outDir, '.'.join([basename, rule, UUID, suffix]))

        # FIXME: This is not a very robust check -- we may have a derivative
        # that has been moved to a different directory.
        if os.path.exists(outFile):
            if update:
                # don't regenerate if the destination is newer or the same
                if os.stat(fn).st_mtime <= os.stat(outFile).st_mtime:
                    continue
            elif not force:
                print('%s already exists' % outFile)
                continue

        # We will use suffixed temporary file: <FILENAME>.tmp.<EXT>
        # A temporary file can safely be deleted if the conversion fails
        # We can't use a prefix here because fn might include directory paths
        tmpFile = '.'.join([basename, rule, UUID, 'tmp', suffix])
        makeDeriv(rule, mtype, fn, UUID, tmpFile, outDir, outFile,
                  basename, frame)
def _cli(args):
    """egaia derive

    Generate derivative formats for archival storage and for distribution.

    Usage:
        egaia derive --help
        egaia derive [ --item=ITEM ] [ --frame=N ] [ --force | --update ]

    """
    # NOTE: the docstring above looks like the docopt usage text for this
    # sub-command, so its wording is left as it was.
    #
    # `args` is the parsed option mapping; the keys read here are '--item',
    # '--frame', '--force' and '--update'.
    if not egaia_root.get_root():
        sys.exit('Please run "egaia init" to create a new collection here.')

    frame = args['--frame']
    if frame is None:
        frame = '1'
    # use input numbering that starts with 1
    frame = int(frame) - 1

    uuid = None
    if args['--item']:
        uuid = egaia_parsefn.getUuid(args['--item'])

    # --update is honoured together with --item as well, as the usage allows
    process(uuid=uuid, frame=frame,
            force=args['--force'], update=args['--update'])
    return