# Source code for egaia.egaia_archivedotorg

# -*- coding: utf-8 -*-

import os

import internetarchive
from qtfaststart import processor as qt
import json

import egaia_root
import egaia_list
import egaia_docx
import egaia_config
import egaia_parsefn
import utils

def listVideos(uuid=None, new=True):
    """Get a list of videos from the current collection that have not been
    uploaded, and matching the uuid if specified. If new is False, only the
    previously uploaded items are returned instead.

    Args:
        uuid: Optional identifier; when given, only items whose identifier
            contains it are kept.
        new: ``True`` keeps only items without a remote embed URL; ``False``
            keeps only items that already have one. Any other value disables
            that filter. In every case, items whose remote embed URLs do not
            mention ``archive.org`` are dropped.

    Returns:
        A list of metadata dicts decoded from
        ``egaia_docx.collectionMetadata()``. When a local file whose path
        contains the item identifier is found, the dict gains a ``filename``
        key (local path) and a ``remote_filename`` key (identifier plus
        extension). Items without a matching local file are still returned,
        without those two keys.
    """
    # list original filenames in the current collection, matching uuid if given
    filenames = egaia_list.listFiles(filter_type='df-h264', uuid=uuid)

    # parse the metadata. No ``encoding`` argument: UTF-8 is already the
    # default on Python 2, and the keyword was removed in Python 3.9 (passing
    # it raises TypeError there).
    items = json.loads(egaia_docx.collectionMetadata())

    # get the metadata labels dict
    fields = dict(egaia_config.printConfig(section='terms'))

    # filter to videos that don't have remote_embed_url
    filtered_items = []
    for item in items:
        item_uuid = item.get(fields['DCTERMS.identifier'])[0]
        if uuid and uuid not in item_uuid:
            continue

        itemtype = item.get(fields['DCTERMS.type'], None)
        if not itemtype or 'moving image' not in itemtype[0].lower():
            continue

        remote = item.get(fields['remote_embed_url'], None)
        # we want: remote is True AND new is False
        #      or: remote is False AND new is True
        if remote and new is True:
            continue
        if not remote and new is False:
            continue
        if remote and not any('archive.org' in i for i in remote):
            continue

        # retrieve the file path from the list generated above
        for filename in filenames:
            if item_uuid and item_uuid in filename:
                item['filename'] = filename
                ext = os.path.splitext(os.path.basename(filename))[1]
                # Force IA to generate an "mp4" derivative for the embed player
                if ext.lower() == '.mp4':
                    ext = '.mpeg4'
                # Add "HD" so we can get an SD/HD toggle in the embed player
                # as the HD source videos have too high a bitrate to be
                # streamable except over a very fast Internet connection.
                # See https://blog.archive.org/2013/02/09/new-video-and-audio-player-video-multiple-qualities-related-videos-and-more/
                # The IA parser is picky, though -- our h264 videos don't get
                # processed as such!
                # Don't use the "df-xxx" tags, as this confuses IA into thinking
                # these are different videos.
                item['remote_filename'] = item_uuid + ext
                break

        filtered_items.append(item)

    return filtered_items
def post(items, upload=True, faststart=False, collection='opensource_movies', dry_run=False):
    """Upload items to the Internet Archive.

    Takes a list of metadata dictionaries, corresponding to csv rows, as
    provided by listVideos(). If upload is not True, the remote metadata will
    be updated but items will not be re-uploaded.

    Args:
        items: Metadata dicts; for uploads each one needs the ``filename``
            and ``remote_filename`` keys set by listVideos().
        upload: Truthy to upload the file and then record the embed URL via
            updateDocx(); falsy to only modify the remote metadata.
        faststart: Accepted for compatibility; not used by this function.
        collection: Value sent as the ``collection`` metadata field.
        dry_run: If truthy, print what would be sent and contact nothing.

    A failure on one item is reported on stdout and the loop moves on to the
    next item.
    """
    archive = egaia_config.getConfig('archive', 'archive_name')
    ia_noindex = egaia_config.getConfig('archive', 'ia_noindex', boolean=True)

    # get the metadata labels dict
    fields = dict(egaia_config.printConfig(section='terms'))

    for item in items:
        item_uuid = item.get(fields['DCTERMS.identifier'])[0]
        print("processing %s..." % item_uuid)

        url = '/'.join([
            egaia_config.getConfig('archive', 'archive_url').strip('/'),
            'item',
            item_uuid
        ])
        link = u'Details: <a href="{url}">{url}</a>'.format(url=url)
        desc = item.get(fields['DCTERMS.description'], [])
        # Currently CRLF sequences get converted to "<BR>" by IA
        description = u'<br>\r\n'.join(desc + [link])

        meta_dict = dict(
            title=item.get(fields['DCTERMS.title'])[0],
            collection=collection,
            mediatype='movies',
            contributor=archive,
            creator=item.get(fields['DCTERMS.creator']),
            description=description,
            subject=item.get(fields['DCTERMS.subject']),
        )
        if ia_noindex is True:
            meta_dict['noindex'] = 'true'

        # listVideos() leaves these keys unset when no local file matched.
        remote_name = item.get('remote_filename')
        local_path = item.get('filename')

        if dry_run:
            print('item: ' + item_uuid)
            # .get() plus a separator: metadata-only items have no local file,
            # and the two names used to be glued together unreadably.
            print('files: %s <- %s' % (remote_name, local_path))
            print('metadata: ')
            print(meta_dict)
            print('')
            continue

        if upload:
            if not remote_name or not local_path:
                print("Upload failed! No local file found for %s" % item_uuid)
                continue
            try:
                r = internetarchive.upload(
                    item_uuid,
                    files={remote_name: local_path},
                    metadata=meta_dict,
                    verbose=True
                )
            except Exception as exc:
                # Exception, not a bare except, so Ctrl-C and SystemExit still
                # stop the batch; %r keeps the reason printable on Python 2.
                print("Upload failed! %r" % (exc,))
                continue
            updateDocx(item_uuid)
        else:
            try:
                r = internetarchive.modify_metadata(item_uuid, metadata=meta_dict)
            except Exception as exc:
                print("Update failed! %r" % (exc,))
                continue

        print("Server response: %s" % r)
def updateDocx(uuid):
    """Populate the "remote_embed_url" metadata field for an item.

    Builds the ``https://archive.org/embed/<uuid>`` URL and hands it, as a
    one-key JSON object keyed by the configured ``remote_embed_url`` label,
    to ``egaia_docx.writeDocx`` for the first file matching ``*.<uuid>.*``.

    Args:
        uuid: Identifier of the item to update.

    Raises:
        IndexError: If no file matches the identifier.
    """
    fields = dict(egaia_config.printConfig(section='terms'))
    url = u'https://archive.org/embed/%s' % uuid

    # json.dumps rather than string interpolation, so a quote or backslash in
    # the label or identifier cannot produce invalid JSON. ensure_ascii=False
    # keeps non-ASCII labels literal, as the interpolated string did.
    json_str = json.dumps({fields['remote_embed_url']: url}, ensure_ascii=False)

    matches = egaia_docx.filter_files(u'*.%s.*' % uuid)
    if not matches:
        # Same exception type as the former bare ``fn[0]``, but with a message
        # that names the item.
        raise IndexError('no docx file found for item %s' % uuid)

    egaia_docx.writeDocx(filename=matches[0], append=json_str)
# NOTE: the docstring below is the command's usage text (it looks like a docopt
# usage message -- confirm against the caller), so it is program text: keep
# its wording and option names in sync with the ``args`` keys read in the body.
def _cli(args):
    """egaia archivedotorg

    Publish videos, along with their Dublin Core metadata, to the Internet
    Archive and embed the published versions in html pages generated by egaia.
    This command also allows you to update remote item metadata.

    Usage:
        egaia archivedotorg --help
        egaia archivedotorg ( --upload | --update ) [ --collection=COLLECTION ] [ --item=ITEM ] [ --dry-run ]
        egaia archivedotorg --update-docx --item=ITEM

    """
    if not egaia_root.get_root():
        # SystemExit directly: the ``exit`` builtin is only injected by the
        # ``site`` module and is missing under ``python -S`` or frozen builds.
        raise SystemExit('Please run "egaia init" to create a new collection here.')

    collection = args['--collection'] or 'opensource_movies'

    # The normalised identifier is written back into ``args``, as before.
    if args['--item'] is not None:
        args['--item'] = egaia_parsefn.getUuid(args['--item'])

    if args['--upload']:
        # Upload videos that have no remote embed URL yet.
        post(
            listVideos(uuid=args['--item']),
            upload=True,
            collection=collection,
            dry_run=args['--dry-run']
        )
    elif args['--update']:
        # Refresh the remote metadata of already-uploaded videos.
        post(
            listVideos(uuid=args['--item'], new=False),
            upload=False,
            collection=collection,
            dry_run=args['--dry-run']
        )
    elif args['--update-docx']:
        # FIXME: Just update everything...
        updateDocx(args['--item'])