# Source code for eqcorrscan.utils.pre_processing

"""
Utilities module whose functions are designed to do the basic \
processing of the data using obspy modules (which also rely on scipy and \
numpy).

:copyright:
    Calum Chamberlain, Chet Hopp.

:license:
    GNU Lesser General Public License, Version 3
    (https://www.gnu.org/copyleft/lesser.html)
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals


def _check_daylong(tr):
    """
    Check the data quality of the daylong file.
    Check to see \
    that the day isn't just zeros, with large steps, if it is then the \
    resampling will hate it.

    :type tr: obspy.core.trace.Trace
    :param tr: Trace to check if the data are daylong.

    :return qual: bool

    .. rubric:: Example

    >>> from obspy import read
    >>> from eqcorrscan.utils.pre_processing import _check_daylong
    >>> st = read('eqcorrscan/tests/test_data/WAV/TEST_/' +
    ...           '2013-09-01-0410-35.DFDPC_024_00')
    >>> _check_daylong(st[0])
    True
    """
    import numpy as np
    if len(tr.data) - len(np.nonzero(tr.data)) < 0.5 * len(tr.data):
        qual = False
    else:
        qual = True
    return qual


def shortproc(st, lowcut, highcut, filt_order, samp_rate, debug=0,
              parallel=False, num_cores=False, starttime=None, endtime=None):
    r"""Basic function to bandpass and downsample.

    Works in place on data.  This is employed to ensure all parts of the \
    data are processed in the same way.

    :type st: obspy.core.stream.Stream
    :param st: Stream to process (a single Trace is also accepted, in which \
        case a Trace is returned)
    :type lowcut: float
    :param lowcut: Low cut for bandpass in Hz, set to None for low-pass only
    :type highcut: float
    :param highcut: High cut for bandpass in Hz, set to None for high-pass \
        only
    :type filt_order: int
    :param filt_order: Number of corners for bandpass filter
    :type samp_rate: float
    :param samp_rate: Sampling rate desired in Hz
    :type debug: int
    :param debug: Debug flag from 0-5, higher numbers = more output
    :type parallel: bool
    :param parallel: Set to True to process traces in parallel, for small \
        numbers of traces this is often slower than serial processing, \
        defaults to False
    :type num_cores: int
    :param num_cores: Control the number of cores for parallel processing, \
        if set to False then this will use all the cores.
    :type starttime: obspy.core.UTCDateTime
    :param starttime: Desired data start time, will trim to this before \
        processing
    :type endtime: obspy.core.UTCDateTime
    :param endtime: Desired data end time, will trim to this before \
        processing

    :return: obspy.Stream (or obspy.Trace if a Trace was given)

    :raises IOError: If highcut is at or above the Nyquist frequency.

    .. note:: Will convert channel names to two characters long.

    .. warning:: If you intend to use this for processing templates you \
        should consider how resampling will effect your cross-correlations. \
        Minor differences in resampling between day-long files (which you \
        are likely to use for continuous detection) and shorter files will \
        reduce your cross-correlations!

    .. rubric:: Example, bandpass

    >>> from obspy import read
    >>> from eqcorrscan.utils.pre_processing import shortproc
    >>> st = read('eqcorrscan/tests/test_data/WAV/TEST_/' +
    ...           '2013-09-01-0410-35.DFDPC_024_00')
    >>> st = shortproc(st=st, lowcut=2, highcut=9, filt_order=3, samp_rate=20,
    ...                debug=0, parallel=True, num_cores=2)
    >>> print(st[0])
    AF.LABE..SZ | 2013-09-01T04:10:35.700000Z - 2013-09-01T04:12:05.650000Z \
| 20.0 Hz, 1800 samples
    """
    from multiprocessing import Pool, cpu_count
    from obspy import Stream, Trace
    # Accept a bare Trace and remember to hand back a Trace at the end.
    if isinstance(st, Trace):
        tracein = True
        st = Stream(st)
    else:
        tracein = False
    # Sanity check the filter band against the target sampling rate.
    if highcut and highcut >= 0.5 * samp_rate:
        raise IOError('Highcut must be lower than the nyquist')
    # At the highest debug levels the workers would swallow output, so
    # force serial processing.
    if debug > 4:
        parallel = False
    if starttime and endtime:
        for tr in st:
            tr.trim(starttime, endtime)
            # If trimming leaves one sample too many, drop the first sample
            # by convention.
            if len(tr.data) == ((endtime - starttime) *
                                tr.stats.sampling_rate) + 1:
                tr.data = tr.data[1:len(tr.data)]
    elif starttime:
        for tr in st:
            tr.trim(starttime=starttime)
    elif endtime:
        for tr in st:
            tr.trim(endtime=endtime)
    if parallel:
        if not num_cores:
            num_cores = cpu_count()
        pool = Pool(processes=num_cores)
        results = [pool.apply_async(process, (tr,),
                                    {'lowcut': lowcut, 'highcut': highcut,
                                     'filt_order': filt_order,
                                     'samp_rate': samp_rate, 'debug': debug,
                                     'starttime': False, 'full_day': False})
                   for tr in st]
        pool.close()
        stream_list = [p.get() for p in results]
        pool.join()
        st = Stream(stream_list)
    else:
        for tr in st:
            process(tr=tr, lowcut=lowcut, highcut=highcut,
                    filt_order=filt_order, samp_rate=samp_rate, debug=debug,
                    starttime=False, full_day=False)
    if tracein:
        st.merge()
        return st[0]
    return st
def dayproc(st, lowcut, highcut, filt_order, samp_rate, starttime, debug=0,
            parallel=True, num_cores=False):
    """
    Wrapper for process to parallel multiple traces in a stream.

    Works in place on data.  This is employed to ensure all parts of the \
    data are processed in the same way.

    :type st: obspy.core.stream.Stream
    :param st: Stream to process (can be trace)
    :type lowcut: float
    :param lowcut: Low cut in Hz for bandpass, set to None for low-pass only
    :type highcut: float
    :param highcut: High cut in Hz for bandpass, set to None for high-pass \
        only
    :type filt_order: int
    :param filt_order: Corners for bandpass
    :type samp_rate: float
    :param samp_rate: Desired sampling rate in Hz
    :type starttime: obspy.core.utcdatetime.UTCDateTime
    :param starttime: Desired start of trace
    :type debug: int
    :param debug: Debug output level from 0-5, higher numbers = more output
    :type parallel: bool
    :param parallel: Set to True to process traces in parallel, this is often \
        faster than serial processing of traces: defaults to True
    :type num_cores: int
    :param num_cores: Control the number of cores for parallel processing, \
        if set to False then this will use all the cores.

    :return: obspy.Stream (or obspy.Trace if a Trace was given)

    :raises IOError: If highcut is at or above the Nyquist frequency.

    .. note:: Will convert channel names to two characters long.

    .. rubric:: Example, bandpass

    >>> import obspy
    >>> if int(obspy.__version__.split('.')[0]) >= 1:
    ...     from obspy.clients.fdsn import Client
    ... else:
    ...     from obspy.fdsn import Client
    >>> from obspy import UTCDateTime
    >>> from eqcorrscan.utils.pre_processing import dayproc
    >>> client = Client('GEONET')
    >>> t1 = UTCDateTime(2012, 3, 26)
    >>> t2 = t1 + 86400
    >>> bulk_info = [('NZ', 'FOZ', '10', 'HHE', t1, t2),
    ...              ('NZ', 'FOZ', '10', 'HHE', t1, t2)]
    >>> st = client.get_waveforms_bulk(bulk_info)
    >>> st = dayproc(st=st, lowcut=2, highcut=9, filt_order=3, samp_rate=20,
    ...              starttime=t1, debug=0, parallel=True, num_cores=2)
    >>> print(st[0])
    NZ.FOZ.10.HE | 2012-03-25T23:59:59.998393Z - 2012-03-26T23:59:59.948393Z \
| 20.0 Hz, 1728000 samples
    """
    from multiprocessing import Pool, cpu_count
    from obspy import Stream, Trace
    # Accept a bare Trace and remember to hand back a Trace at the end.
    if isinstance(st, Trace):
        st = Stream(st)
        tracein = True
    else:
        tracein = False
    # Sanity check the filter band against the target sampling rate.
    if highcut and highcut >= 0.5 * samp_rate:
        raise IOError('Highcut must be lower than the nyquist')
    # At the highest debug levels the workers would swallow output, so
    # force serial processing.
    if debug > 4:
        parallel = False
    if parallel:
        if not num_cores:
            num_cores = cpu_count()
        pool = Pool(processes=num_cores)
        results = [pool.apply_async(process, (tr,),
                                    {'lowcut': lowcut, 'highcut': highcut,
                                     'filt_order': filt_order,
                                     'samp_rate': samp_rate, 'debug': debug,
                                     'starttime': starttime,
                                     'full_day': True})
                   for tr in st]
        pool.close()
        stream_list = [p.get() for p in results]
        pool.join()
        st = Stream(stream_list)
    else:
        for tr in st:
            process(tr=tr, lowcut=lowcut, highcut=highcut,
                    filt_order=filt_order, samp_rate=samp_rate, debug=debug,
                    starttime=starttime, full_day=True)
    if tracein:
        st.merge()
        return st[0]
    return st
def process(tr, lowcut, highcut, filt_order, samp_rate, debug,
            starttime=False, full_day=False, seisan=True):
    r"""Basic function to process data, usually called by dayproc or shortproc.

    Functionally, this will bandpass, downsample and check headers and \
    length of trace to ensure files start at the start of a day and are \
    daylong.  Works in place on data.  This is employed to ensure all parts \
    of the data are processed in the same way.

    .. note:: Usually this function is called via dayproc or shortproc.

    :type tr: obspy.core.trace.Trace
    :param tr: Trace to process
    :type lowcut: float
    :param lowcut: Low cut in Hz, if set to None and highcut is set, will \
        use a lowpass filter.
    :type highcut: float
    :param highcut: High cut in Hz, if set to None and lowcut is set, will \
        use a highpass filter.
    :type filt_order: int
    :param filt_order: Number of corners for filter.
    :type samp_rate: float
    :param samp_rate: Desired sampling rate in Hz
    :type debug: int
    :param debug: Debug output level from 0-5, higher numbers = more output
    :type starttime: obspy.core.utcdatetime.UTCDateTime
    :param starttime: Desired start of trace
    :type full_day: bool
    :param full_day: Whether to expect, and enforce a full day of data or not.
    :type seisan: bool
    :param seisan: Whether channels are named like seisan channels (which \
        are two letters rather than three) - defaults to True.

    :return: obspy.core.trace.Trace

    :raises IOError: If highcut is at or above the Nyquist frequency.
    :raises ValueError: If the data are mostly zeros, or if full_day is set \
        and the data cannot be padded to daylong length.

    .. note:: Will convert channel names to two characters long.
    """
    import warnings
    from obspy.signal.filter import bandpass, lowpass, highpass
    # Add sanity check
    if highcut and highcut >= 0.5 * samp_rate:
        raise IOError('Highcut must be lower than the nyquist')
    # Define the start-time
    if starttime:
        day = starttime.date
    else:
        day = tr.stats.starttime.date
    if debug >= 2:
        print('Working on: ' + tr.stats.station + '.' + tr.stats.channel)
    if debug >= 5:
        tr.plot()
    # Do a brute force quality check
    qual = _check_daylong(tr)
    if not qual:
        # Note: string concatenation, not a tuple - a trailing comma here
        # previously passed a tuple as the exception message.
        msg = ("Data have more zeros than actual data, please check the raw"
               " data set-up and manually sort it: " + tr.stats.station +
               "." + tr.stats.channel)
        raise ValueError(msg)
    tr = tr.detrend('simple')
    # Detrend data before filtering
    if debug > 0:
        print('I have ' + str(len(tr.data)) + ' data points for ' +
              tr.stats.station + '.' + tr.stats.channel +
              ' before processing')
    # Sanity check to ensure files are daylong
    if float(tr.stats.npts / tr.stats.sampling_rate) != 86400.0\
       and full_day:
        if debug >= 2:
            print('Data for ' + tr.stats.station + '.' + tr.stats.channel +
                  ' are not of daylong length, will zero pad')
        # Use obspy's trim function with zero padding
        tr = tr.trim(starttime, starttime + 86400, pad=True, fill_value=0,
                     nearest_sample=True)
        # If there is one sample too many after this remove the first one
        # by convention
        if len(tr.data) == (86400 * tr.stats.sampling_rate) + 1:
            tr.data = tr.data[1:len(tr.data)]
        if not tr.stats.sampling_rate * 86400 == tr.stats.npts:
            raise ValueError('Data are not daylong for ' + tr.stats.station +
                             '.' + tr.stats.channel)
        print('I now have ' + str(len(tr.data)) +
              ' data points after enforcing day length')
    # Check sampling rate and resample
    if tr.stats.sampling_rate != samp_rate:
        if debug >= 2:
            print('Resampling')
        tr.resample(samp_rate)
    # Filtering section
    tr = tr.detrend('simple')
    # Detrend data again before filtering
    if highcut and lowcut:
        if debug >= 2:
            print('Bandpassing')
        tr.data = bandpass(tr.data, lowcut, highcut,
                           tr.stats.sampling_rate, filt_order, True)
    elif highcut:
        if debug >= 2:
            print('Lowpassing')
        tr.data = lowpass(tr.data, highcut, tr.stats.sampling_rate,
                          filt_order, True)
    elif lowcut:
        if debug >= 2:
            print('Highpassing')
        tr.data = highpass(tr.data, lowcut, tr.stats.sampling_rate,
                           filt_order, True)
    else:
        warnings.warn('No filters applied')
    # Account for two letter channel names in s-files and therefore templates
    if seisan:
        tr.stats.channel = tr.stats.channel[0] + tr.stats.channel[-1]
    # Sanity check the time header
    if tr.stats.starttime.day != day and full_day:
        warnings.warn("Time headers do not match expected date: " +
                      str(tr.stats.starttime))
    # Sanity check to ensure files are daylong
    if float(tr.stats.npts / tr.stats.sampling_rate) != 86400.0 and full_day:
        if debug >= 2:
            print('Data for ' + tr.stats.station + '.' + tr.stats.channel +
                  ' is not of daylong length, will zero pad')
        # Use obspy's trim function with zero padding
        tr = tr.trim(starttime, starttime + 86400, pad=True, fill_value=0,
                     nearest_sample=True)
        # If there is one sample too many after this remove the first one
        # by convention
        if len(tr.data) == (86400 * tr.stats.sampling_rate) + 1:
            tr.data = tr.data[1:len(tr.data)]
        if not tr.stats.sampling_rate * 86400 == tr.stats.npts:
            raise ValueError('Data are not daylong for ' + tr.stats.station +
                             '.' + tr.stats.channel)
    # Final visual check for debug
    if debug > 4:
        tr.plot()
    return tr
if __name__ == "__main__":
    # Execute the embedded doctests when this module is run as a script.
    import doctest
    doctest.testmod()