# -*- coding: utf-8 -*-
"""
Accepts and handles requests for tasks.
Each of the following runs in its own Thread/Process.
Basically: a client/server pattern spawns the worker processes,
and a publish/subscribe pattern returns the resulting data.
Accepter:
Receives tasks and requests
Delegates tasks and responds to requests
Tasks are delegated to an engine
Engine:
The engine accepts requests.
The engine immediately responds WHERE it will be ready.
The engine sends a message to the collector saying that something will be ready.
The engine then executes a task.
The engine is given direct access to the data.
Collector:
The collector accepts requests
The collector can respond:
* <ResultContent>
* Results are ready.
* Results are not ready.
* Unknown jobid.
* Error computing results.
* Progress percent.
References:
Simple task farm, with routed replies in pyzmq
http://stackoverflow.com/questions/7809200/implementing-task-farm-messaging-pattern-with-zeromq
https://gist.github.com/minrk/1358832
Notes:
We are essentially going to be spawning two processes.
We can test these simultaneously using
python -m wbia.web.job_engine job_engine_tester
We can test these separately by first starting the background server
python -m wbia.web.job_engine job_engine_tester --bg
Alternative:
python -m wbia.web.job_engine job_engine_tester --bg --no-engine
python -m wbia.web.job_engine job_engine_tester --bg --only-engine --fg-engine
And then running the foreground process
python -m wbia.web.job_engine job_engine_tester --fg
"""
from __future__ import absolute_import, division, print_function, unicode_literals
# if False:
# import os
# os.environ['UTOOL_NOCNN'] = 'True'
import utool as ut
import time
import zmq
import uuid # NOQA
import numpy as np
import shelve
import random
from datetime import datetime, timedelta
import pytz
import flask
from os.path import join, exists, abspath, splitext, basename
from functools import partial
from wbia.control import controller_inject
import threading
print, rrr, profile = ut.inject2(__name__)
CLASS_INJECT_KEY, register_ibs_method = controller_inject.make_ibs_register_decorator(
__name__
)
register_api = controller_inject.get_wbia_flask_api(__name__)
ctx = zmq.Context.instance()
# FIXME: needs to use correct number of ports
URL = 'tcp://127.0.0.1'
NUM_ENGINES = 1
VERBOSE_JOBS = (
ut.get_argflag('--bg') or ut.get_argflag('--fg') or ut.get_argflag('--verbose-jobs')
)
GLOBAL_SHELVE_LOCK = threading.Lock()
TIMESTAMP_FMTSTR = '%Y-%m-%d %H:%M:%S %Z'
TIMESTAMP_TIMEZONE = 'US/Pacific'
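# Example timestamp in this format (illustrative): '2020-01-01 12:00:00 PST'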
JOB_STATUS_CACHE = {}
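# The queue loops below are built on pyzmq's ROUTER/DEALER device pattern. A
# minimal, self-contained sketch of that pattern (not used by this module;
# the ports and names are illustrative only):
def _zmq_queue_device_sketch():
    import zmq
    ctx_ = zmq.Context.instance()
    rout_sock = ctx_.socket(zmq.ROUTER)  # clients connect to this end
    deal_sock = ctx_.socket(zmq.DEALER)  # workers connect to this end
    rout_sock.bind('tcp://127.0.0.1:51391')
    deal_sock.bind('tcp://127.0.0.1:51392')
    # Relay messages in both directions, preserving the identity frames that
    # ROUTER prepends so replies can be routed back to the original caller.
    zmq.device(zmq.QUEUE, rout_sock, deal_sock)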
def update_proctitle(procname, dbname=None):
try:
import setproctitle
print('CHANGING PROCESS TITLE')
old_title = setproctitle.getproctitle()
print('old_title = %r' % (old_title,))
# new_title = 'IBEIS_' + procname + ' ' + old_title
# new_title = procname + ' ' + old_title
# new_title = 'wbia_zmq_loop'
hostname = ut.get_computer_name()
new_title = 'IBEIS_%s_%s_%s' % (dbname, hostname, procname,)
print('new_title = %r' % (new_title,))
setproctitle.setproctitle(new_title)
except ImportError:
print('setproctitle not installed. Run: pip install setproctitle')
def _get_engine_job_paths(ibs):
shelve_path = ibs.get_shelves_path()
ut.ensuredir(shelve_path)
record_filepath_list = list(ut.iglob(join(shelve_path, '*.pkl')))
return record_filepath_list
def _get_engine_lock_paths(ibs):
shelve_path = ibs.get_shelves_path()
ut.ensuredir(shelve_path)
lock_filepath_list = list(ut.iglob(join(shelve_path, '*.lock')))
return lock_filepath_list
@register_ibs_method
def initialize_job_manager(ibs):
"""
Starts a background zmq job engine. Initializes a zmq object in this thread
that can talk to the background processes.
Run from the webserver
CommandLine:
python -m wbia.web.job_engine --exec-initialize_job_manager:0
Example:
>>> # DISABLE_DOCTEST
>>> from wbia.web.job_engine import * # NOQA
>>> import wbia
>>> ibs = wbia.opendb(defaultdb='testdb1')
>>> from wbia.web import apis_engine
>>> from wbia.web import job_engine
>>> ibs.load_plugin_module(job_engine)
>>> ibs.load_plugin_module(apis_engine)
>>> ibs.initialize_job_manager()
>>> print('Initialization success. Now closing')
>>> ibs.close_job_manager()
>>> print('Closing success.')
Example:
>>> # xdoctest: +REQUIRES(--web)
>>> from wbia.web.job_engine import * # NOQA
>>> import wbia
>>> import requests
>>> web_instance = wbia.opendb_bg_web(db='testdb1')
>>> web_port = web_instance.ibs.get_web_port_via_scan()
>>> if web_port is None:
>>> raise ValueError('IA web server is not running on any expected port')
>>> baseurl = 'http://127.0.1.1:%s' % (web_port, )
>>> _payload = {'image_attrs_list': [], 'annot_attrs_list': []}
>>> payload = ut.map_dict_vals(ut.to_json, _payload)
>>> resp1 = requests.post(baseurl + '/api/test/helloworld/?f=b', data=payload)
>>> #resp2 = requests.post(baseurl + '/api/image/json/', data=payload)
>>> #print(resp2)
>>> web_instance.terminate()
>>> #json_dict = resp2.json()
>>> #text = json_dict['response']
>>> #print(text)
"""
ibs.job_manager = ut.DynStruct()
use_static_ports = False
if ut.get_argflag('--web-deterministic-ports'):
use_static_ports = True
if ut.get_argflag('--fg'):
ibs.job_manager.reciever = JobBackend(use_static_ports=True)
else:
ibs.job_manager.reciever = JobBackend(use_static_ports=use_static_ports)
ibs.job_manager.reciever.initialize_background_processes(
dbdir=ibs.get_dbdir(), containerized=ibs.containerized,
)
# Delete any leftover locks from before
lock_filepath_list = _get_engine_lock_paths(ibs)
print('Deleting %d leftover engine locks' % (len(lock_filepath_list),))
for lock_filepath in lock_filepath_list:
ut.delete(lock_filepath)
ibs.job_manager.jobiface = JobInterface(
0, ibs.job_manager.reciever.port_dict, ibs=ibs
)
ibs.job_manager.jobiface.initialize_client_thread()
# Wait until the collector becomes live
# NOTE: this polling loop is intentionally disabled ('0 and True' always short-circuits to False)
while 0 and True:
result = ibs.get_job_status(-1)
print('result = %r' % (result,))
if result['status'] == 'ok':
break
ibs.job_manager.jobiface.queue_interrupted_jobs()
# import wbia
# #dbdir = '/media/raid/work/testdb1'
# from wbia.web import app
# web_port = ibs.get_web_port_via_scan()
# if web_port is None:
# raise ValueError('IA web server is not running on any expected port')
# proc = ut.spawn_background_process(app.start_from_wbia, ibs, port=web_port)
@register_ibs_method
def close_job_manager(ibs):
# if hasattr(ibs, 'job_manager') and ibs.job_manager is not None:
# pass
del ibs.job_manager.reciever
del ibs.job_manager.jobiface
ibs.job_manager = None
@register_ibs_method
@register_api('/api/engine/job/', methods=['GET', 'POST'], __api_plural_check__=False)
def get_job_id_list(ibs):
"""
Web call that returns the list of job ids
CommandLine:
# Run Everything together
python -m wbia.web.job_engine --exec-get_job_status
# Start job queue in its own process
python -m wbia.web.job_engine job_engine_tester --bg
# Start web server in its own process
./main.py --web --fg
# Run foreground process
python -m wbia.web.job_engine --exec-get_job_status:0 --fg
Example:
>>> # xdoctest: +REQUIRES(--web)
>>> from wbia.web.job_engine import * # NOQA
>>> import wbia
>>> web_ibs = wbia.opendb_bg_web('testdb1') # , domain='http://52.33.105.88')
>>> # Test get status of a job id that does not exist
>>> response = web_ibs.send_wbia_request('/api/engine/job/', jobid='badjob')
>>> web_ibs.terminate2()
"""
status = ibs.job_manager.jobiface.get_job_id_list()
jobid_list = status['jobid_list']
return jobid_list
@register_ibs_method
@register_api(
'/api/engine/job/status/', methods=['GET', 'POST'], __api_plural_check__=False
)
def get_job_status(ibs, jobid=None):
"""
Web call that returns the status of a job
Returns one of:
received - job has been received, but not ingested yet
accepted - job has been accepted (validated)
queued - job has been transferred to the engine queue
working - job is being worked on by the engine
publishing - job is done on the engine, pushing results to collector
completed | exception - job is complete or has an error
CommandLine:
# Run Everything together
python -m wbia.web.job_engine --exec-get_job_status
# Start job queue in its own process
python -m wbia.web.job_engine job_engine_tester --bg
# Start web server in its own process
./main.py --web --fg
# Run foreground process
python -m wbia.web.job_engine --exec-get_job_status:0 --fg
Example:
>>> # xdoctest: +REQUIRES(--web)
>>> from wbia.web.job_engine import * # NOQA
>>> import wbia
>>> web_ibs = wbia.opendb_bg_web('testdb1') # , domain='http://52.33.105.88')
>>> # Test get status of a job id that does not exist
>>> response = web_ibs.send_wbia_request('/api/engine/job/status/', jobid='badjob')
>>> web_ibs.terminate2()
"""
if jobid is None:
status = ibs.job_manager.jobiface.get_job_status_dict()
else:
status = ibs.job_manager.jobiface.get_job_status(jobid)
return status
@register_ibs_method
@register_api('/api/engine/job/result/', methods=['GET', 'POST'])
def get_job_result(ibs, jobid):
"""
Web call that returns the result of a job
"""
result = ibs.job_manager.jobiface.get_job_result(jobid)
return result
@register_ibs_method
@register_api('/api/engine/job/result/wait/', methods=['GET', 'POST'])
def wait_for_job_result(ibs, jobid, timeout=10, freq=0.1):
ibs.job_manager.jobiface.wait_for_job_result(jobid, timeout=timeout, freq=freq)
result = ibs.job_manager.jobiface.get_unpacked_result(jobid)
return result
def _get_random_open_port():
port = random.randint(1024, 49151)
while not ut.is_local_port_open(port):
port = random.randint(1024, 49151)
assert ut.is_local_port_open(port)
return port
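# A minimal sketch of the kind of TCP probe a local-port check can perform.
# This module relies on ut.is_local_port_open for the real check; the helper
# below is illustrative only and is not called anywhere:
def _is_local_port_open_sketch(port):
    import socket
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # connect_ex returns 0 when something is already listening on the port
        return sock.connect_ex(('127.0.0.1', port)) != 0
    finally:
        sock.close()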
def job_engine_tester():
"""
CommandLine:
python -m wbia.web.job_engine --exec-job_engine_tester
python -b -m wbia.web.job_engine --exec-job_engine_tester
python -m wbia.web.job_engine job_engine_tester
python -m wbia.web.job_engine job_engine_tester --bg
python -m wbia.web.job_engine job_engine_tester --fg
Example:
>>> # SCRIPT
>>> from wbia.web.job_engine import * # NOQA
>>> job_engine_tester()
"""
_init_signals()
# now start a few clients, and fire off some requests
client_id = np.random.randint(1000)
reciever = JobBackend(use_static_ports=True)
jobiface = JobInterface(client_id, reciever.port_dict)
from wbia.init import sysres
if ut.get_argflag('--bg'):
dbdir = sysres.get_args_dbdir(
defaultdb='cache', allow_newdir=False, db=None, dbdir=None
)
reciever.initialize_background_processes(dbdir)
print('[testzmq] parent process is looping forever')
while True:
time.sleep(1)
elif ut.get_argflag('--fg'):
jobiface.initialize_client_thread()
else:
dbdir = sysres.get_args_dbdir(
defaultdb='cache', allow_newdir=False, db=None, dbdir=None
)
reciever.initialize_background_processes(dbdir)
jobiface.initialize_client_thread()
# Foreground test script
print('... waiting for jobs')
if ut.get_argflag('--cmd'):
ut.embed()
# jobiface.queue_job()
else:
print('[test] ... emit test1')
callback_url = None
callback_method = None
args = (1,)
jobid1 = jobiface.queue_job('helloworld', callback_url, callback_method, *args)
jobiface.wait_for_job_result(jobid1)
jobid_list = []
args = ([1], [3, 4, 5])
kwargs = dict(cfgdict={'K': 1})
identify_jobid = jobiface.queue_job(
'query_chips_simple_dict', callback_url, callback_method, *args, **kwargs
)
for jobid in jobid_list:
jobiface.wait_for_job_result(jobid)
jobiface.wait_for_job_result(identify_jobid)
print('FINISHED TEST SCRIPT')
class JobBackend(object):
def __init__(self, **kwargs):
# self.num_engines = 3
self.num_engines = NUM_ENGINES
self.engine_queue_proc = None
self.engine_procs = None
self.collect_queue_proc = None
self.collect_proc = None
# --
only_engine = ut.get_argflag('--only-engine')
self.spawn_collector = not only_engine
self.spawn_engine = not ut.get_argflag('--no-engine')
self.fg_engine = ut.get_argflag('--fg-engine')
self.spawn_queue = not only_engine
# Find ports
self.port_dict = None
self._initialize_job_ports(**kwargs)
print('JobBackend ports:')
ut.print_dict(self.port_dict)
def __del__(self):
if VERBOSE_JOBS:
print('Cleaning up job backend')
if self.engine_procs is not None:
for i in self.engine_procs:
i.terminate()
if self.engine_queue_proc is not None:
self.engine_queue_proc.terminate()
if self.collect_proc is not None:
self.collect_proc.terminate()
if self.collect_queue_proc is not None:
self.collect_queue_proc.terminate()
if VERBOSE_JOBS:
print('Killed external procs')
def _initialize_job_ports(self, use_static_ports=False, static_root=51381):
# _portgen = functools.partial(six.next, itertools.count(51381))
key_list = [
'engine_url1',
'engine_url2',
'collect_url1',
'collect_url2',
# 'collect_pushpull_url',
]
# Get ports
if use_static_ports:
port_list = range(static_root, static_root + len(key_list))
else:
port_list = []
while len(port_list) < len(key_list):
port = _get_random_open_port()
if port not in port_list:
port_list.append(port)
port_list = sorted(port_list)
# Assign ports
assert len(key_list) == len(port_list)
self.port_dict = {
key: '%s:%d' % (URL, port) for key, port in list(zip(key_list, port_list))
}
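    # With use_static_ports=True and the default static_root=51381, the
    # resulting port_dict is (in key_list order):
    #   {'engine_url1': 'tcp://127.0.0.1:51381',
    #    'engine_url2': 'tcp://127.0.0.1:51382',
    #    'collect_url1': 'tcp://127.0.0.1:51383',
    #    'collect_url2': 'tcp://127.0.0.1:51384'}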
    def initialize_background_processes(
self, dbdir=None, containerized=False, thread=True
):
print = partial(ut.colorprint, color='fuchsia')
# if VERBOSE_JOBS:
print('Initialize Background Processes')
def _spawner(func, *args, **kwargs):
if thread:
# mp.set_start_method('spawn')
_spawner_func_ = ut.spawn_background_daemon_thread
else:
_spawner_func_ = ut.spawn_background_process
proc = _spawner_func_(func, *args, **kwargs)
assert proc.is_alive(), 'proc (%s) died too soon' % (ut.get_funcname(func,))
return proc
if self.spawn_queue:
self.engine_queue_proc = _spawner(engine_queue_loop, self.port_dict)
self.collect_queue_proc = _spawner(collect_queue_loop, self.port_dict)
if self.spawn_collector:
self.collect_proc = _spawner(
collector_loop, self.port_dict, dbdir, containerized
)
if self.spawn_engine:
if self.fg_engine:
print('ENGINE IS IN DEBUG FOREGROUND MODE')
# Spawn engine in foreground process
assert self.num_engines == 1, 'fg engine only works with one engine'
engine_loop(0, self.port_dict, dbdir)
assert False, 'should never see this'
else:
# Normal case
self.engine_procs = [
_spawner(engine_loop, i, self.port_dict, dbdir, containerized)
for i in range(self.num_engines)
]
# Check if online
# wait for processes to spin up
if self.spawn_queue:
assert self.engine_queue_proc.is_alive(), 'engine died too soon'
assert self.collect_queue_proc.is_alive(), 'collector queue died too soon'
if self.spawn_collector:
assert self.collect_proc.is_alive(), 'collector died too soon'
if self.spawn_engine:
for engine in self.engine_procs:
assert engine.is_alive(), 'engine died too soon'
def get_shelve_lock_filepath(shelve_filepath):
shelve_lock_filepath = '%s.lock' % (shelve_filepath,)
return shelve_lock_filepath
def touch_shelve_lock_file(shelve_filepath):
shelve_lock_filepath = get_shelve_lock_filepath(shelve_filepath)
assert not exists(shelve_lock_filepath)
ut.touch(shelve_lock_filepath, verbose=False)
assert exists(shelve_lock_filepath)
def delete_shelve_lock_file(shelve_filepath):
shelve_lock_filepath = get_shelve_lock_filepath(shelve_filepath)
assert exists(shelve_lock_filepath)
ut.delete(shelve_lock_filepath, verbose=False)
assert not exists(shelve_lock_filepath)
def wait_for_shelve_lock_file(shelve_filepath, timeout=600):
shelve_lock_filepath = get_shelve_lock_filepath(shelve_filepath)
start_time = time.time()
while exists(shelve_lock_filepath):
current_time = time.time()
elapsed = current_time - start_time
if elapsed >= timeout:
return False
time.sleep(1)
if int(elapsed) % 5 == 0:
print('Waited %0.02f seconds for the lock so far' % (elapsed,))
return True
def get_shelve_value(shelve_filepath, key):
wait_for_shelve_lock_file(shelve_filepath)
with GLOBAL_SHELVE_LOCK:
wait_for_shelve_lock_file(shelve_filepath)
touch_shelve_lock_file(shelve_filepath)
value = None
try:
with shelve.open(shelve_filepath, 'r') as shelf:
value = shelf.get(key)
except Exception:
pass
delete_shelve_lock_file(shelve_filepath)
return value
def set_shelve_value(shelve_filepath, key, value):
wait_for_shelve_lock_file(shelve_filepath)
with GLOBAL_SHELVE_LOCK:
wait_for_shelve_lock_file(shelve_filepath)
touch_shelve_lock_file(shelve_filepath)
flag = False
try:
with shelve.open(shelve_filepath) as shelf:
shelf[key] = value
flag = True
except Exception:
pass
delete_shelve_lock_file(shelve_filepath)
return flag
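# Usage sketch for the lock-protected shelve helpers above (the filepath and
# key are illustrative only). Both readers and writers wait on the '.lock'
# sidecar file before touching the shelf, so concurrent processes do not
# step on each other:
def _shelve_roundtrip_sketch(shelve_filepath):
    flag = set_shelve_value(shelve_filepath, 'metadata', {'jobcounter': 1})
    assert flag, 'write failed'
    return get_shelve_value(shelve_filepath, 'metadata')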
def get_shelve_filepaths(ibs, jobid):
shelve_path = ibs.get_shelves_path()
shelve_input_filepath = abspath(join(shelve_path, '%s.input.shelve' % (jobid,)))
shelve_output_filepath = abspath(join(shelve_path, '%s.output.shelve' % (jobid,)))
return shelve_input_filepath, shelve_output_filepath
class JobInterface(object):
def __init__(jobiface, id_, port_dict, ibs=None):
jobiface.id_ = id_
jobiface.ibs = ibs
jobiface.verbose = 2 if VERBOSE_JOBS else 1
jobiface.port_dict = port_dict
print('JobInterface ports:')
ut.print_dict(jobiface.port_dict)
# def init(jobiface):
# # Starts several new processes
# jobiface.initialize_background_processes()
# # Does not create a new process, but connects sockets on this process
# jobiface.initialize_client_thread()
    def initialize_client_thread(jobiface):
"""
Creates a ZMQ object in this thread. This talks to background processes.
"""
print = partial(ut.colorprint, color='blue')
if jobiface.verbose:
print('Initializing JobInterface')
jobiface.engine_deal_sock = ctx.socket(zmq.DEALER)
jobiface.engine_deal_sock.setsockopt_string(
zmq.IDENTITY, 'client%s.engine.DEALER' % (jobiface.id_,)
)
jobiface.engine_deal_sock.connect(jobiface.port_dict['engine_url1'])
if jobiface.verbose:
print('connect engine_url1 = %r' % (jobiface.port_dict['engine_url1'],))
jobiface.collect_deal_sock = ctx.socket(zmq.DEALER)
jobiface.collect_deal_sock.setsockopt_string(
zmq.IDENTITY, 'client%s.collect.DEALER' % (jobiface.id_,)
)
jobiface.collect_deal_sock.connect(jobiface.port_dict['collect_url1'])
if jobiface.verbose:
print('connect collect_url1 = %r' % (jobiface.port_dict['collect_url1'],))
    def queue_interrupted_jobs(jobiface):
import tqdm
ibs = jobiface.ibs
if ibs is not None:
MAX_ATTEMPTS = 20
ARCHIVE_DAYS = 14
timezone = pytz.timezone(TIMESTAMP_TIMEZONE)
now = datetime.now(timezone)
now = now.replace(microsecond=0)
now = now.replace(second=0)
now = now.replace(minute=0)
now = now.replace(hour=0)
archive_delta = timedelta(days=ARCHIVE_DAYS)
archive_date = now - archive_delta
archive_timestamp = archive_date.strftime(TIMESTAMP_FMTSTR)
shelve_path = ibs.get_shelves_path()
shelve_path = shelve_path.rstrip('/')
shelve_archive_path = '%s_ARCHIVE' % (shelve_path,)
ut.ensuredir(shelve_archive_path)
record_filepath_list = _get_engine_job_paths(ibs)
restart_jobcounter_list = []
restart_jobid_list = []
restart_request_list = []
completed_jobcounter = 0
num_registered, num_archived, num_suppressed, num_corrupted = 0, 0, 0, 0
print('Reloading %d engine jobs...' % (len(record_filepath_list),))
for record_filepath in tqdm.tqdm(record_filepath_list):
jobid = splitext(basename(record_filepath))[0]
# Load the engine record
record = ut.load_cPkl(record_filepath, verbose=False)
# Load the record info
engine_request = record.get('request', None)
attempts = record.get('attempts', 0)
completed = record.get('completed', False)
# Check status
suppressed = attempts >= MAX_ATTEMPTS
corrupted = engine_request is None
# Load metadata
shelve_input_filepath, shelve_output_filepath = get_shelve_filepaths(
ibs, jobid
)
metadata = get_shelve_value(shelve_input_filepath, 'metadata')
if metadata is None:
print('Missing metadata...corrupted')
corrupted = True
archive_flag = False
if not corrupted:
jobcounter = metadata.get('jobcounter', None)
times = metadata.get('times', {})
if jobcounter is None:
print('Missing jobcounter...corrupted')
corrupted = True
job_age = None
if not corrupted and completed:
completed_timestamp = times.get('completed', None)
if completed_timestamp is not None:
try:
archive_elapsed = calculate_timedelta(
completed_timestamp, archive_timestamp
)
job_age = archive_elapsed[-1]
archive_flag = job_age > 0
except Exception:
args = (
completed_timestamp,
archive_timestamp,
)
print(
'[job_engine] Could not determine archive status!\n\tCompleted: %r\n\tArchive: %r'
% args
)
if archive_flag:
with ut.Indenter('[client %d] ' % (jobiface.id_)):
color = 'brightmagenta'
print_ = partial(ut.colorprint, color=color)
print_('ARCHIVING JOB (AGE: %d SECONDS)' % (job_age,))
job_scr_filepath_list = list(
ut.iglob(join(shelve_path, '%s*' % (jobid,)))
)
for job_scr_filepath in job_scr_filepath_list:
job_dst_filepath = job_scr_filepath.replace(
shelve_path, shelve_archive_path
)
ut.copy(
job_scr_filepath,
job_dst_filepath,
overwrite=True,
) # ut.copy allows for overwrite, ut.move does not
ut.delete(job_scr_filepath)
num_archived += 1
if not archive_flag and not (completed or suppressed or corrupted):
with ut.Indenter('[client %d] ' % (jobiface.id_)):
color = 'brightblue' if attempts == 0 else 'brightred'
print_ = partial(ut.colorprint, color=color)
print_(
    'RESTARTING INTERRUPTED JOB (ATTEMPT %d)'
    % (attempts + 1,)
)
print_(ut.repr3(record_filepath))
# print_(ut.repr3(record))
times = metadata.get('times', {})
received = times['received']
engine_request['restart_jobid'] = jobid
engine_request['restart_jobcounter'] = jobcounter
engine_request['restart_received'] = received
restart_jobcounter_list.append(jobcounter)
restart_jobid_list.append(jobid)
restart_request_list.append(engine_request)
record['attempts'] = attempts + 1
ut.save_cPkl(record_filepath, record, verbose=False)
# We may have suppressed this for being corrupted
if not archive_flag and (completed or suppressed or corrupted):
if completed:
status = 'completed'
elif suppressed:
status = 'suppressed'
num_suppressed += 1
else:
status = 'corrupted'
num_corrupted += 1
reply_notify = {
'jobid': jobid,
'status': status,
'action': 'register',
}
jobiface.collect_deal_sock.send_json(reply_notify)
reply = jobiface.collect_deal_sock.recv_json()
# jobcounter_ = reply['jobcounter']
jobid_ = reply['jobid']
# assert jobcounter_ == jobcounter
assert jobid_ == jobid
if not corrupted and jobcounter is not None:
completed_jobcounter = max(completed_jobcounter, jobcounter)
num_registered += 1
print('Registered %d jobs...' % (num_registered,))
print('\t %d suppressed jobs' % (num_suppressed,))
print('\t %d corrupted jobs' % (num_corrupted,))
print('Archived %d jobs...' % (num_archived,))
# Update the jobcounter to be up to date
update_notify = {
'__set_jobcounter__': completed_jobcounter,
}
print('Updating completed job counter: %r' % (update_notify,))
jobiface.engine_deal_sock.send_json(update_notify)
reply = jobiface.engine_deal_sock.recv_json()
jobcounter_ = reply['jobcounter']
assert jobcounter_ == completed_jobcounter
print('Re-sending %d engine jobs...' % (len(restart_jobcounter_list),))
index_list = np.argsort(restart_jobcounter_list)
zipped = list(
zip(restart_jobcounter_list, restart_jobid_list, restart_request_list)
)
zipped = ut.take(zipped, index_list)
for jobcounter, jobid, engine_request in tqdm.tqdm(zipped):
jobiface.engine_deal_sock.send_json(engine_request)
reply = jobiface.engine_deal_sock.recv_json()
jobcounter_ = reply['jobcounter']
jobid_ = reply['jobid']
assert jobcounter_ == jobcounter
assert jobid_ == jobid
    def queue_job(
jobiface, action, callback_url=None, callback_method=None, *args, **kwargs
):
r"""
IBEIS:
This is just a function that lives in the main thread and ships off
a job.
FIXME: I do not like having callback_url and callback_method specified
like this with args and kwargs. If these must be there, then they should
be specified first; the preferred option is that args and kwargs are
never passed without the * and ** syntax.
The client sends messages and receives replies after they have been
processed by the engine.
"""
# NAME: job_client
with ut.Indenter('[client %d] ' % (jobiface.id_)):
print = partial(ut.colorprint, color='blue')
if jobiface.verbose >= 1:
print('----')
request = {}
try:
if flask.request:
request = {
'endpoint': flask.request.path,
'function': flask.request.endpoint,
'input': flask.request.processed,
}
except RuntimeError:
pass
engine_request = {
'action': action,
'args': args,
'kwargs': kwargs,
'callback_url': callback_url,
'callback_method': callback_method,
'request': request,
'restart_jobid': None,
'restart_jobcounter': None,
'restart_received': None,
}
if jobiface.verbose >= 2:
print('Queue job: %s' % (engine_request))
# Send request to job
jobiface.engine_deal_sock.send_json(engine_request)
reply_notify = jobiface.engine_deal_sock.recv_json()
jobid = reply_notify['jobid']
ibs = jobiface.ibs
if ibs is not None:
shelve_path = ibs.get_shelves_path()
ut.ensuredir(shelve_path)
record_filename = '%s.pkl' % (jobid,)
record_filepath = join(shelve_path, record_filename)
record = {
'request': engine_request,
'attempts': 0,
'completed': False,
}
ut.save_cPkl(record_filepath, record, verbose=False)
# Release memory
action = None
args = None
kwargs = None
callback_url = None
callback_method = None
request = None
engine_request = None
return jobid
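    # Usage sketch (illustrative arguments), as exercised by job_engine_tester:
    #   jobid = jobiface.queue_job('helloworld', None, None, 1)
    #   jobiface.wait_for_job_result(jobid)
    #   result = jobiface.get_unpacked_result(jobid)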
    def get_job_id_list(jobiface):
with ut.Indenter('[client %d] ' % (jobiface.id_)):
print = partial(ut.colorprint, color='teal')
if False: # jobiface.verbose >= 1:
print('----')
print('Request list of job ids')
pair_msg = dict(action='job_id_list')
# CALLS: collector_request_status
jobiface.collect_deal_sock.send_json(pair_msg)
reply = jobiface.collect_deal_sock.recv_json()
return reply
    def get_job_status(jobiface, jobid):
with ut.Indenter('[client %d] ' % (jobiface.id_)):
print = partial(ut.colorprint, color='teal')
if jobiface.verbose >= 1:
print('----')
print('Request status of jobid=%r' % (jobid,))
pair_msg = dict(action='job_status', jobid=jobid)
# CALLS: collector_request_status
jobiface.collect_deal_sock.send_json(pair_msg)
reply = jobiface.collect_deal_sock.recv_json()
return reply
    def get_job_status_dict(jobiface):
with ut.Indenter('[client %d] ' % (jobiface.id_)):
print = partial(ut.colorprint, color='teal')
if False: # jobiface.verbose >= 1:
print('----')
print('Request list of job ids')
pair_msg = dict(action='job_status_dict')
# CALLS: collector_request_status
jobiface.collect_deal_sock.send_json(pair_msg)
reply = jobiface.collect_deal_sock.recv_json()
return reply
    def get_job_result(jobiface, jobid):
with ut.Indenter('[client %d] ' % (jobiface.id_)):
if jobiface.verbose >= 1:
print = partial(ut.colorprint, color='teal')
print('----')
print('Request result of jobid=%r' % (jobid,))
pair_msg = dict(action='job_result', jobid=jobid)
# CALLER: collector_request_result
jobiface.collect_deal_sock.send_json(pair_msg)
reply = jobiface.collect_deal_sock.recv_json()
return reply
    def get_unpacked_result(jobiface, jobid):
reply = jobiface.get_job_result(jobid)
json_result = reply['json_result']
try:
result = ut.from_json(json_result)
except TypeError as ex:
ut.printex(ex, keys=['json_result'], iswarning=True)
result = json_result
except Exception as ex:
ut.printex(ex, 'Failed to unpack result', keys=['json_result'])
result = reply['json_result']
# Release raw JSON result
json_result = None
return result
    def wait_for_job_result(jobiface, jobid, timeout=10, freq=0.1):
t = ut.Timer(verbose=False)
t.tic()
while True:
reply = jobiface.get_job_status(jobid)
if reply['jobstatus'] == 'completed':
return
elif reply['jobstatus'] == 'exception':
result = jobiface.get_unpacked_result(jobid)
# raise Exception(result)
print('Exception occurred in engine')
return result
elif reply['jobstatus'] == 'working':
pass
elif reply['jobstatus'] == 'unknown':
pass
else:
raise Exception('Unknown jobstatus=%r' % (reply['jobstatus'],))
reply = None # Release memory
time.sleep(freq)
if timeout is not None and t.toc() > timeout:
raise Exception('Timeout')
def make_queue_loop(name='collect'):
"""
Standard queue loop
Args:
name (None): (default = None)
"""
assert name is not None, 'must name queue'
queue_name = name + '_queue'
loop_name = queue_name + '_loop'
def queue_loop(port_dict):
iface1, iface2 = port_dict['collect_url1'], port_dict['collect_url2']
print = partial(ut.colorprint, color='green')
update_proctitle(queue_name)
with ut.Indenter('[%s] ' % (queue_name,)):
if VERBOSE_JOBS:
print('Init make_queue_loop: name=%r' % (name,))
# bind the client dealer to the queue router
rout_sock = ctx.socket(zmq.ROUTER)
rout_sock.setsockopt_string(zmq.IDENTITY, 'queue.' + name + '.' + 'ROUTER')
rout_sock.bind(iface1)
if VERBOSE_JOBS:
print('bind %s_url1 = %r' % (name, iface1,))
# bind the server router to the queue dealer
deal_sock = ctx.socket(zmq.DEALER)
deal_sock.setsockopt_string(zmq.IDENTITY, 'queue.' + name + '.' + 'DEALER')
deal_sock.bind(iface2)
if VERBOSE_JOBS:
print('bind %s_url2 = %r' % (name, iface2,))
try:
if 1:
# the remainder of this function can be entirely replaced with
zmq.device(zmq.QUEUE, rout_sock, deal_sock)
else:
# but this shows what is really going on:
poller = zmq.Poller()
poller.register(rout_sock, zmq.POLLIN)
poller.register(deal_sock, zmq.POLLIN)
while True:
evts = dict(poller.poll())
# poll() returns a list of tuples [(socket, evt), (socket, evt)]
# dict(poll()) turns this into {socket:evt, socket:evt}
if rout_sock in evts:
msg = rout_sock.recv_multipart()
# ROUTER sockets prepend the identity of the jobiface,
# for routing replies
if VERBOSE_JOBS:
print('ROUTER relayed %r via DEALER' % (msg,))
deal_sock.send_multipart(msg)
msg = None
if deal_sock in evts:
msg = deal_sock.recv_multipart()
if VERBOSE_JOBS:
print('DEALER relayed %r via ROUTER' % (msg,))
rout_sock.send_multipart(msg)
msg = None
except KeyboardInterrupt:
print('Caught ctrl+c in collector loop. Gracefully exiting')
if VERBOSE_JOBS:
print('Exiting %s' % (loop_name,))
ut.set_funcname(queue_loop, loop_name)
return queue_loop
collect_queue_loop = make_queue_loop(name='collect')
def engine_queue_loop(port_dict):
"""
Specialized queue loop
"""
# Flow of information tags:
# NAME: engine_queue
update_proctitle('engine_queue_loop')
iface1, iface2 = port_dict['engine_url1'], port_dict['engine_url2']
name = 'engine'
queue_name = name + '_queue'
loop_name = queue_name + '_loop'
print = partial(ut.colorprint, color='red')
with ut.Indenter('[%s] ' % (queue_name,)):
print('Init specialized make_queue_loop: name=%r' % (name,))
# bind the client dealer to the queue router
rout_sock = ctx.socket(zmq.ROUTER)
rout_sock.setsockopt_string(
zmq.IDENTITY, 'special_queue.' + name + '.' + 'ROUTER'
)
rout_sock.bind(iface1)
if VERBOSE_JOBS:
print('bind %s_url1 = %r' % (name, iface1,))
# bind the server router to the queue dealer
deal_sock = ctx.socket(zmq.DEALER)
deal_sock.setsockopt_string(
zmq.IDENTITY, 'special_queue.' + name + '.' + 'DEALER'
)
deal_sock.bind(iface2)
if VERBOSE_JOBS:
print('bind %s_url2 = %r' % (name, iface2,))
collect_deal_sock = ctx.socket(zmq.DEALER)
collect_deal_sock.setsockopt_string(zmq.IDENTITY, queue_name + '.collect.DEALER')
collect_deal_sock.connect(port_dict['collect_url1'])
if VERBOSE_JOBS:
print('connect collect_url1 = %r' % (port_dict['collect_url1'],))
# but this shows what is really going on:
poller = zmq.Poller()
poller.register(rout_sock, zmq.POLLIN)
poller.register(deal_sock, zmq.POLLIN)
# always start at 0
global_jobcounter = 0
try:
while True:
evts = dict(poller.poll())
if rout_sock in evts:
# CALLER: job_client
idents, engine_request = rcv_multipart_json(
rout_sock, num=1, print=print
)
set_jobcounter = engine_request.get('__set_jobcounter__', None)
if set_jobcounter is not None:
global_jobcounter = set_jobcounter
reply_notify = {
'jobcounter': global_jobcounter,
}
print(
'... notifying client that jobcounter was updated to %d'
% (global_jobcounter,)
)
# RETURNS: job_client_return
send_multipart_json(rout_sock, idents, reply_notify)
continue
# jobid = 'jobid-%04d' % (jobcounter,)
jobid = '%s' % (uuid.uuid4(),)
jobcounter = global_jobcounter + 1
received = _timestamp()
action = engine_request['action']
args = engine_request['args']
kwargs = engine_request['kwargs']
callback_url = engine_request['callback_url']
callback_method = engine_request['callback_method']
request = engine_request['request']
restart_jobid = engine_request.get('restart_jobid', None)
restart_jobcounter = engine_request.get('restart_jobcounter', None)
restart_received = engine_request.get('restart_received', None)
if restart_jobid is not None:
    print(
        '[RESTARTING] Replacing jobid=%s with previous restart_jobid=%s'
        % (jobid, restart_jobid,)
    )
    jobid = restart_jobid
if restart_jobcounter is not None:
    print(
        '[RESTARTING] Replacing jobcounter=%s with previous restart_jobcounter=%s'
        % (jobcounter, restart_jobcounter,)
    )
    jobcounter = restart_jobcounter
print('Creating jobid %r (counter %d)' % (jobid, jobcounter,))
if restart_received is not None:
received = restart_received
######################################################################
# Status: Received (Notify Collector)
# Reply immediately with a new jobid
reply_notify = {
'jobid': jobid,
'jobcounter': jobcounter,
'status': 'received',
'action': 'notification',
}
if VERBOSE_JOBS:
print('...notifying collector about new job')
# CALLS: collector_notify
collect_deal_sock.send_json(reply_notify)
######################################################################
# Status: Received (Notify Client)
if VERBOSE_JOBS:
print('... notifying client that job was accepted')
# RETURNS: job_client_return
send_multipart_json(rout_sock, idents, reply_notify)
######################################################################
# Status: Metadata
# Reply immediately with a new jobid
metadata_notify = {
'jobid': jobid,
'metadata': {
'jobcounter': jobcounter,
'action': action,
'args': args,
'kwargs': kwargs,
'callback_url': callback_url,
'callback_method': callback_method,
'request': request,
'times': {
'received': received,
'started': None,
'updated': None,
'completed': None,
'runtime': None,
'turnaround': None,
'runtime_sec': None,
'turnaround_sec': None,
},
},
'action': 'metadata',
}
if VERBOSE_JOBS:
print('...notifying collector about job metadata')
# CALLS: collector_notify
collect_deal_sock.send_json(metadata_notify)
######################################################################
# Status: Accepted (Metadata Processed)
# We have been accepted, let's update the global_jobcounter
global_jobcounter = jobcounter
# Reply immediately with a new jobid
reply_notify = {
'jobid': jobid,
'status': 'accepted',
'action': 'notification',
}
if VERBOSE_JOBS:
print('...notifying collector about new job')
# CALLS: collector_notify
collect_deal_sock.send_json(reply_notify)
######################################################################
# Status: Queueing on the Engine
assert 'jobid' not in engine_request
engine_request['jobid'] = jobid
if VERBOSE_JOBS:
print('... notifying backend engine to start')
# CALL: engine_
send_multipart_json(deal_sock, idents, engine_request)
# Release
idents = None
engine_request = None
######################################################################
# Status: Queued
queued_notify = {
'jobid': jobid,
'status': 'queued',
'action': 'notification',
}
if VERBOSE_JOBS:
print('...notifying collector that job was queued')
# CALLS: collector_notify
collect_deal_sock.send_json(queued_notify)
if deal_sock in evts:
pass
except KeyboardInterrupt:
print('Caught ctrl+c in %s queue. Gracefully exiting' % (loop_name,))
if VERBOSE_JOBS:
print('Exiting %s queue' % (loop_name,))
def engine_loop(id_, port_dict, dbdir, containerized):
r"""
IBEIS:
This will be part of a worker process with its own IBEISController
instance.
Needs to send where the results will go and then publish the results there.
The engine_loop - receives messages, performs some action, and sends a reply,
preserving the leading two message parts as routing identities
"""
# NAME: engine_
# CALLED_FROM: engine_queue
import wbia
# base_print = print # NOQA
print = partial(ut.colorprint, color='darkred')
with ut.Indenter('[engine %d] ' % (id_)):
if VERBOSE_JOBS:
print('Initializing engine')
print('connect engine_url2 = %r' % (port_dict['engine_url2'],))
assert dbdir is not None
engine_rout_sock = ctx.socket(zmq.ROUTER)
engine_rout_sock.connect(port_dict['engine_url2'])
collect_deal_sock = ctx.socket(zmq.DEALER)
collect_deal_sock.setsockopt_string(zmq.IDENTITY, 'engine.collect.DEALER')
collect_deal_sock.connect(port_dict['collect_url1'])
if VERBOSE_JOBS:
print('connect collect_url1 = %r' % (port_dict['collect_url1'],))
print('engine is initialized')
try:
while True:
# ibs = wbia.opendb(dbdir=dbdir, use_cache=False, web=False, force_serial=True)
ibs = wbia.opendb(dbdir=dbdir, use_cache=False, web=False)
update_proctitle('engine_loop', dbname=ibs.dbname)
idents, engine_request = rcv_multipart_json(engine_rout_sock, print=print)
action = engine_request['action']
jobid = engine_request['jobid']
args = engine_request['args']
kwargs = engine_request['kwargs']
callback_url = engine_request['callback_url']
callback_method = engine_request['callback_method']
# Notify start working
reply_notify = {
'jobid': jobid,
'status': 'working',
'action': 'notification',
}
collect_deal_sock.send_json(reply_notify)
engine_result = on_engine_request(ibs, jobid, action, args, kwargs)
exec_status = engine_result['exec_status']
# Notify start working
reply_notify = {
'jobid': jobid,
'status': 'publishing',
'action': 'notification',
}
collect_deal_sock.send_json(reply_notify)
# Store results in the collector
collect_request = dict(
idents=idents,
action='store',
jobid=jobid,
engine_result=engine_result,
callback_url=callback_url,
callback_method=callback_method,
)
# if VERBOSE_JOBS:
print(
'...done working. pushing result to collector for jobid %s' % (jobid,)
)
# CALLS: collector_store
collect_deal_sock.send_json(collect_request)
# Notify start working
reply_notify = {
'jobid': jobid,
'status': exec_status,
'action': 'notification',
}
collect_deal_sock.send_json(reply_notify)
# We no longer need the engine result, and can clear its memory
engine_request = None
engine_result = None
collect_request = None
# Release the IBEIS controller for each job, hopefully freeing memory
ibs = None
# Explicitly try to release GPU memory
try:
import torch
torch.cuda.empty_cache()
except Exception:
pass
# Explicitly release Python memory
try:
import gc
gc.collect()
except Exception:
pass
except KeyboardInterrupt:
print('Caught ctrl+c in engine loop. Gracefully exiting')
# ----
if VERBOSE_JOBS:
print('Exiting engine loop')
def on_engine_request(ibs, jobid, action, args, kwargs):
""" Run whenever the engine receives a message """
# Start working
if VERBOSE_JOBS:
print('starting job=%r' % (jobid,))
# Map actions to IBEISController calls here
if action == 'helloworld':
def helloworld(time_=0, *args, **kwargs):
time.sleep(time_)
retval = ('HELLO time_=%r ' % (time_,)) + ut.repr2((args, kwargs))
return retval
action_func = helloworld
else:
# check for ibs func
action_func = getattr(ibs, action)
if VERBOSE_JOBS:
print('resolving action=%r to wbia function=%r' % (action, action_func))
try:
key = '__jobid__'
assert key not in kwargs
kwargs[key] = jobid
result = action_func(*args, **kwargs)
exec_status = 'completed'
except Exception as ex:
result = ut.formatex(ex, keys=['jobid'], tb=True)
result = ut.strip_ansi(result)
exec_status = 'exception'
json_result = ut.to_json(result)
result = None # Clear any used memory
engine_result = dict(exec_status=exec_status, json_result=json_result, jobid=jobid,)
return engine_result
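# Illustrative example of the engine_result produced above for the built-in
# 'helloworld' action (values abbreviated):
#   on_engine_request(ibs, 'some-jobid', 'helloworld', (1,), {})
#   -> {'exec_status': 'completed',
#       'json_result': '"HELLO time_=1 ..."',
#       'jobid': 'some-jobid'}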
def collector_loop(port_dict, dbdir, containerized):
"""
Service that stores completed algorithm results
"""
import wbia
print = partial(ut.colorprint, color='yellow')
with ut.Indenter('[collect] '):
collect_rout_sock = ctx.socket(zmq.ROUTER)
collect_rout_sock.setsockopt_string(zmq.IDENTITY, 'collect.ROUTER')
collect_rout_sock.connect(port_dict['collect_url2'])
if VERBOSE_JOBS:
print('connect collect_url2 = %r' % (port_dict['collect_url2'],))
ibs = wbia.opendb(dbdir=dbdir, use_cache=False, web=False)
update_proctitle('collector_loop', dbname=ibs.dbname)
shelve_path = ibs.get_shelves_path()
ut.ensuredir(shelve_path)
collector_data = {}
try:
while True:
# several callers here
# CALLER: collector_notify
# CALLER: collector_store
# CALLER: collector_request_status
# CALLER: collector_request_metadata
# CALLER: collector_request_result
idents, collect_request = rcv_multipart_json(
collect_rout_sock, print=print
)
try:
reply = on_collect_request(
ibs,
collect_request,
collector_data,
shelve_path,
containerized=containerized,
)
except Exception as ex:
import traceback
print(ut.repr3(collect_request))
ut.printex(ex, 'ERROR in collection')
print(traceback.format_exc())
reply = {}
send_multipart_json(collect_rout_sock, idents, reply)
idents = None
collect_request = None
# Explicitly release Python memory
try:
import gc
gc.collect()
except Exception:
pass
except KeyboardInterrupt:
print('Caught ctrl+c in collector loop. Gracefully exiting')
if VERBOSE_JOBS:
print('Exiting collector')
def _timestamp():
timezone = pytz.timezone(TIMESTAMP_TIMEZONE)
now = datetime.now(timezone)
timestamp = now.strftime(TIMESTAMP_FMTSTR)
return timestamp
def invalidate_global_cache(jobid):
global JOB_STATUS_CACHE
JOB_STATUS_CACHE.pop(jobid, None)
def get_collector_shelve_filepaths(collector_data, jobid):
if jobid is None:
return None, None
shelve_input_filepath = collector_data.get(jobid, {}).get('input', None)
shelve_output_filepath = collector_data.get(jobid, {}).get('output', None)
return shelve_input_filepath, shelve_output_filepath
def convert_to_date(timestamp):
TIMESTAMP_FMTSTR_ = ' '.join(TIMESTAMP_FMTSTR.split(' ')[:-1])
timestamp_ = ' '.join(timestamp.split(' ')[:-1])
timestamp_date = datetime.strptime(timestamp_, TIMESTAMP_FMTSTR_)
return timestamp_date
def calculate_timedelta(start, end):
start_date = convert_to_date(start)
end_date = convert_to_date(end)
delta = end_date - start_date
total_seconds = int(delta.total_seconds())
total_seconds_ = total_seconds
hours = total_seconds_ // (60 * 60)
total_seconds_ -= hours * 60 * 60
minutes = total_seconds_ // 60
total_seconds_ -= minutes * 60
seconds = total_seconds_
return hours, minutes, seconds, total_seconds
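# Worked example for the helpers above (illustrative timestamps): with
#   start = '2020-01-01 10:00:00 PST'
#   end = '2020-01-01 12:34:56 PST'
# convert_to_date strips the trailing timezone token, and
# calculate_timedelta(start, end) returns (2, 34, 56, 9296).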
def on_collect_request(
ibs, collect_request, collector_data, shelve_path, containerized=False
):
""" Run whenever the collector recieves a message """
import requests
action = collect_request.get('action', None)
jobid = collect_request.get('jobid', None)
status = collect_request.get('status', None)
reply = {
'status': 'ok',
'jobid': jobid,
}
# Ensure we have a collector record for the jobid
if jobid is not None:
if jobid not in collector_data:
collector_data[jobid] = {
'status': None,
'input': None,
'output': None,
}
runtime_lock_filepath = join(shelve_path, '%s.lock' % (jobid,))
else:
runtime_lock_filepath = None
args = get_collector_shelve_filepaths(collector_data, jobid)
collector_shelve_input_filepath, collector_shelve_output_filepath = args
if action == 'notification':
assert None not in [jobid, runtime_lock_filepath]
collector_data[jobid]['status'] = status
print('Notify %s' % ut.repr3(collector_data[jobid]))
invalidate_global_cache(jobid)
if status == 'received':
ut.touch(runtime_lock_filepath)
if status == 'completed':
if exists(runtime_lock_filepath):
ut.delete(runtime_lock_filepath)
# Mark the engine request as finished
record_filename = '%s.pkl' % (jobid,)
record_filepath = join(shelve_path, record_filename)
record = ut.load_cPkl(record_filepath, verbose=False)
record['completed'] = True
ut.save_cPkl(record_filepath, record, verbose=False)
record = None
# Update relevant times in the shelf
metadata = get_shelve_value(collector_shelve_input_filepath, 'metadata')
if metadata is not None:
times = metadata.get('times', {})
times['updated'] = _timestamp()
if status == 'working':
times['started'] = _timestamp()
if status == 'completed':
times['completed'] = _timestamp()
# Calculate runtime
received = times.get('received', None)
started = times.get('started', None)
completed = times.get('completed', None)
runtime = times.get('runtime', None)
turnaround = times.get('turnaround', None)
if None not in [started, completed] and runtime is None:
hours, minutes, seconds, total_seconds = calculate_timedelta(
started, completed
)
args = (
hours,
minutes,
seconds,
total_seconds,
)
times['runtime'] = '%d hours %d min. %s sec. (total: %d sec.)' % args
times['runtime_sec'] = total_seconds
if None not in [received, completed] and turnaround is None:
hours, minutes, seconds, total_seconds = calculate_timedelta(
received, completed
)
args = (
hours,
minutes,
seconds,
total_seconds,
)
times['turnaround'] = '%d hours %d min. %s sec. (total: %d sec.)' % args
times['turnaround_sec'] = total_seconds
metadata['times'] = times
set_shelve_value(collector_shelve_input_filepath, 'metadata', metadata)
metadata = None # Release memory
elif action == 'register':
assert None not in [jobid]
invalidate_global_cache(jobid)
shelve_input_filepath, shelve_output_filepath = get_shelve_filepaths(ibs, jobid)
metadata = get_shelve_value(shelve_input_filepath, 'metadata')
engine_result = get_shelve_value(shelve_output_filepath, 'result')
if status == 'completed':
# Ensure we can read the data we expect out of a completed job
if None in [metadata, engine_result]:
status = 'corrupted'
collector_data[jobid] = {
'status': status,
'input': shelve_input_filepath,
'output': shelve_output_filepath,
}
print('Register %s' % ut.repr3(collector_data[jobid]))
metadata, engine_result = None, None # Release memory
elif action == 'metadata':
invalidate_global_cache(jobid)
# From the Engine
metadata = collect_request.get('metadata', None)
shelve_input_filepath, shelve_output_filepath = get_shelve_filepaths(ibs, jobid)
collector_data[jobid]['input'] = shelve_input_filepath
set_shelve_value(shelve_input_filepath, 'metadata', metadata)
print('Stored Metadata %s' % ut.repr3(collector_data[jobid]))
metadata = None # Release memory
elif action == 'store':
invalidate_global_cache(jobid)
# From the Engine
engine_result = collect_request.get('engine_result', None)
callback_url = collect_request.get('callback_url', None)
callback_method = collect_request.get('callback_method', None)
# Get the engine result jobid
jobid = engine_result.get('jobid', jobid)
assert jobid in collector_data
shelve_input_filepath, shelve_output_filepath = get_shelve_filepaths(ibs, jobid)
collector_data[jobid]['output'] = shelve_output_filepath
set_shelve_value(shelve_output_filepath, 'result', engine_result)
print('Stored Result %s' % ut.repr3(collector_data[jobid]))
engine_result = None # Release memory
if callback_url is not None:
if containerized:
callback_url = callback_url.replace('://localhost/', '://wildbook:8080/')
if callback_method is None:
callback_method = 'POST'
callback_method = callback_method.upper()
message = 'callback_method %r unsupported' % (callback_method,)
assert callback_method in ['POST', 'GET', 'PUT'], message
try:
data_dict = {'jobid': jobid}
args = (
callback_url,
callback_method,
data_dict,
)
print(
'Attempting job completion callback to %r\n\tHTTP Method: %r\n\tData Payload: %r'
% args
)
# Perform callback
if callback_method == 'POST':
response = requests.post(callback_url, data=data_dict)
elif callback_method == 'GET':
response = requests.get(callback_url, params=data_dict)
elif callback_method == 'PUT':
response = requests.put(callback_url, data=data_dict)
else:
raise RuntimeError()
# Check response
try:
text = response.text  # already a text (unicode) string on both py2 and py3
except Exception:
text = None
args = (
response,
text,
)
print('Callback completed...\n\tResponse: %r\n\tText: %r' % args)
except Exception:
print('Callback FAILED!')
elif action == 'job_status':
reply['jobstatus'] = collector_data.get(jobid, {}).get('status', 'unknown')
elif action == 'job_status_dict':
json_result = {}
for jobid in collector_data:
if jobid in JOB_STATUS_CACHE:
job_status_data = JOB_STATUS_CACHE.get(jobid, None)
else:
status = collector_data[jobid]['status']
shelve_input_filepath, shelve_output_filepath = get_shelve_filepaths(
ibs, jobid
)
metadata = get_shelve_value(shelve_input_filepath, 'metadata')
cache = True
if metadata is None:
if status in ['corrupted']:
status = 'corrupted'
elif status in ['suppressed']:
status = 'suppressed'
elif status in ['completed']:
status = 'corrupted'
else:
# status = 'pending'
cache = False
metadata = {
'jobcounter': -1,
}
times = metadata.get('times', {})
request = metadata.get('request', {})
# Support legacy jobs
if request is None:
request = {}
job_status_data = {
'status': status,
'jobcounter': metadata.get('jobcounter', None),
'action': metadata.get('action', None),
'endpoint': request.get('endpoint', None),
'function': request.get('function', None),
'time_received': times.get('received', None),
'time_started': times.get('started', None),
'time_runtime': times.get('runtime', None),
'time_updated': times.get('updated', None),
'time_completed': times.get('completed', None),
'time_turnaround': times.get('turnaround', None),
'time_runtime_sec': times.get('runtime_sec', None),
'time_turnaround_sec': times.get('turnaround_sec', None),
}
if cache:
JOB_STATUS_CACHE[jobid] = job_status_data
json_result[jobid] = job_status_data
reply['json_result'] = json_result
metadata = None # Release memory
elif action == 'job_id_list':
reply['jobid_list'] = sorted(list(collector_data.keys()))
elif action == 'job_input':
if jobid not in collector_data:
reply['status'] = 'invalid'
metadata = None
else:
metadata = get_shelve_value(collector_shelve_input_filepath, 'metadata')
if metadata is None:
reply['status'] = 'corrupted'
reply['json_result'] = metadata
metadata = None # Release memory
elif action == 'job_result':
if jobid not in collector_data:
reply['status'] = 'invalid'
result = None
else:
status = collector_data[jobid]['status']
engine_result = get_shelve_value(collector_shelve_output_filepath, 'result')
if engine_result is None:
if status in ['corrupted']:
status = 'corrupted'
elif status in ['suppressed']:
status = 'suppressed'
elif status in ['completed']:
status = 'corrupted'
else:
# status = 'pending'
pass
reply['status'] = status
result = None
else:
reply['status'] = engine_result['exec_status']
json_result = engine_result['json_result']
result = ut.from_json(json_result)
reply['json_result'] = result
engine_result = None # Release memory
else:
# Other
print('...error unknown action=%r' % (action,))
reply['status'] = 'error'
return reply
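# Illustrative shape of the reply for a 'job_status' request on a known jobid
# (reply starts as {'status': 'ok', 'jobid': ...} and the branch above adds
# the 'jobstatus' field):
#   {'status': 'ok', 'jobid': '<uuid>', 'jobstatus': 'completed'}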
def send_multipart_json(sock, idents, reply):
""" helper """
reply_json = ut.to_json(reply).encode('utf-8')
reply = None
multi_reply = idents + [reply_json]
sock.send_multipart(multi_reply)
def rcv_multipart_json(sock, num=2, print=print):
""" helper """
# note that the first two parts will be ['Controller.ROUTER', 'Client.<id_>']
# these are needed for the reply to propagate up to the right client
multi_msg = sock.recv_multipart()
if VERBOSE_JOBS:
print('----')
print('RCV Json: %s' % (ut.repr2(multi_msg, truncate=True),))
idents = multi_msg[:num]
request_json = multi_msg[num]
request = ut.from_json(request_json)
request_json = None
multi_msg = None
return idents, request
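# Illustrative wire format as seen by rcv_multipart_json with num=2, where
# the leading identity frames were prepended by ROUTER sockets along the way:
#   [b'<router_ident>', b'<client_ident>', b'{"action": "job_status", ...}']
# send_multipart_json reverses this by prepending the same idents to a reply.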
def _on_ctrl_c(signal, frame):
print('[wbia.zmq] Caught ctrl+c')
print('[wbia.zmq] sys.exit(0)')
import sys
sys.exit(0)
def _init_signals():
import signal
signal.signal(signal.SIGINT, _on_ctrl_c)
if __name__ == '__main__':
"""
CommandLine:
python -m wbia.web.job_engine
python -m wbia.web.job_engine --allexamples
python -m wbia.web.job_engine --allexamples --noface --nosrc
"""
import multiprocessing
multiprocessing.freeze_support() # for win32
import utool as ut # NOQA
ut.doctest_funcs()