"""Additional helper functions dealing with transient-CW F(t0,tau) maps.
See Prix, Giampanis & Messenger (PRD 84, 023007, 2011):
https://arxiv.org/abs/1104.1704
for the algorithm in general
and Keitel & Ashton (CQG 35, 205003, 2018):
https://arxiv.org/abs/1805.05652
for a detailed discussion of the GPU implementation.
"""
import numpy as np
import os
import logging
from time import time
# optional imports
import importlib as imp
def _optional_import(modulename, shorthand=None):
"""
Import a module/submodule only if it's available.
using importlib instead of __import__
because the latter doesn't handle sub.modules
Also including a special check to fail more gracefully
when CUDA_DEVICE is set to too high a number.
"""
if shorthand is None:
shorthand = modulename
shorthandbit = ""
else:
shorthandbit = " as " + shorthand
try:
globals()[shorthand] = imp.import_module(modulename)
logging.debug("Successfully imported module %s%s." % (modulename, shorthandbit))
success = True
except ImportError:
logging.debug("Failed to import module {:s}.".format(modulename))
success = False
return success
[docs]class pyTransientFstatMap:
"""
Simplified object class for a F(t0,tau) F-stat map.
This is based on LALSuite's transientFstatMap_t type,
replacing the gsl matrix with a numpy array.
Here, `t0` is a transient start time,
`tau` is a transient duration parameter,
and `F(t0,tau)` is the F-statistic (not 2F)! evaluated
for a signal with those parameters
(and an implicit window function, which is not stored inside this object).
The 'map' covers a range of different `(t0,tau)` pairs.
Attributes
----------
F_mn: np.ndarray
2D array of F values (not 2F!).
maxF: float
Maximum of F (not 2F!) over the array.
t0_ML: float
Maximum likelihood estimate of the transient start time `t0`.
tau_ML: float
Maximum likelihood estimate of the transient duration `tau`.
"""
def __init__(self, N_t0Range, N_tauRange):
"""
The class can be initialized with the following:
Parameters
----------
N_t0Range: int
Number of `t0` values covered.
N_tauRange: int
Number of `tau` values covered.
"""
self.F_mn = np.zeros((N_t0Range, N_tauRange), dtype=np.float32)
# Initializing maxF to a negative value ensures
# that we always update at least once and hence return
# sane t0_d_ML, tau_d_ML
# even if there is only a single bin where F=0 happens.
self.maxF = float(-1.0)
self.t0_ML = float(0.0)
self.tau_ML = float(0.0)
fstatmap_versions = {
"lal": lambda multiFstatAtoms, windowRange: getattr(
lalpulsar, "ComputeTransientFstatMap"
)(multiFstatAtoms, windowRange, False),
"pycuda": lambda multiFstatAtoms, windowRange: pycuda_compute_transient_fstat_map(
multiFstatAtoms, windowRange
),
}
"""Dictionary of the actual callable transient F-stat map functions this module supports.
Actual runtime variability depends on the corresponding external modules
being available.
"""
[docs]def init_transient_fstat_map_features(wantCuda=False, cudaDeviceName=None):
"""Initialization of available modules (or 'features') for computing transient F-stat maps.
Currently, two implementations are supported and checked for
through the `_optional_import()` method:
1. `lal`: requires both `lal` and `lalpulsar` packages to be importable.
2. `pycuda`: requires the `pycuda` package to be importable
along with its modules
`driver`, `gpuarray`, `tools` and `compiler`.
Parameters
----------
wantCuda: bool
Only if this is True and it was possible to import pycuda,
a CUDA device context is created and returned.
cudaDeviceName: str or None
Request a CUDA device with this name.
Partial matches are allowed.
Returns
-------
features: dict
A dictionary of available method names, to match `fstatmap_versions`.
Each key's value is set to `True` only if
all required modules are importable on this system.
gpu_context: pycuda.driver.Context or None
A CUDA device context object, if assigned.
"""
features = {}
have_lal = _optional_import("lal")
have_lalpulsar = _optional_import("lalpulsar")
features["lal"] = have_lal and have_lalpulsar
# import GPU features
have_pycuda = _optional_import("pycuda")
have_pycuda_drv = _optional_import("pycuda.driver", "drv")
have_pycuda_gpuarray = _optional_import("pycuda.gpuarray", "gpuarray")
have_pycuda_tools = _optional_import("pycuda.tools", "cudatools")
have_pycuda_compiler = _optional_import("pycuda.compiler", "cudacomp")
features["pycuda"] = (
have_pycuda
and have_pycuda_drv
and have_pycuda_gpuarray
and have_pycuda_tools
and have_pycuda_compiler
)
logging.debug("Got the following features for transient F-stat maps:")
logging.debug(features)
if wantCuda and features["pycuda"]:
logging.debug("CUDA version: " + ".".join(map(str, drv.get_version())))
drv.init()
logging.debug(
"Starting with default pyCUDA context,"
" then checking all available devices..."
)
try:
context0 = pycuda.tools.make_default_context()
except pycuda._driver.LogicError as e:
if e.message == "cuDeviceGet failed: invalid device ordinal":
devn = int(os.environ["CUDA_DEVICE"])
raise RuntimeError(
"Requested CUDA device number {} exceeds"
" number of available devices!"
" Please change through environment"
" variable $CUDA_DEVICE.".format(devn)
)
else:
raise pycuda._driver.LogicError(e.message)
num_gpus = drv.Device.count()
logging.debug("Found {} CUDA device(s).".format(num_gpus))
devices = []
devnames = np.empty(num_gpus, dtype="S32")
for n in range(num_gpus):
devn = drv.Device(n)
devices.append(devn)
devnames[n] = devn.name().replace(" ", "-").replace("_", "-")
logging.debug(
"device {}: model: {}, RAM: {}MB".format(
n, devnames[n], devn.total_memory() / (2.0 ** 20)
)
)
if "CUDA_DEVICE" in os.environ:
devnum0 = int(os.environ["CUDA_DEVICE"])
else:
devnum0 = 0
matchbit = ""
if cudaDeviceName:
# allow partial matches in device names
devmatches = [
devidx
for devidx, devname in enumerate(devnames)
if cudaDeviceName in devname
]
if len(devmatches) == 0:
context0.detach()
raise RuntimeError(
'Requested CUDA device "{}" not found.'
" Available devices: [{}]".format(
cudaDeviceName, ",".join(devnames)
)
)
else:
devnum = devmatches[0]
if len(devmatches) > 1:
logging.warning(
'Found {} CUDA devices matching name "{}".'
" Choosing first one with index {}.".format(
len(devmatches), cudaDeviceName, devnum
)
)
os.environ["CUDA_DEVICE"] = str(devnum)
matchbit = '(matched to user request "{}")'.format(cudaDeviceName)
elif "CUDA_DEVICE" in os.environ:
devnum = int(os.environ["CUDA_DEVICE"])
else:
devnum = 0
devn = devices[devnum]
logging.info(
"Choosing CUDA device {},"
" of {} devices present: {}{}...".format(
devnum, num_gpus, devn.name(), matchbit
)
)
if devnum == devnum0:
gpu_context = context0
else:
context0.pop()
gpu_context = pycuda.tools.make_default_context()
gpu_context.push()
_print_GPU_memory_MB("Available")
else:
gpu_context = None
return features, gpu_context
[docs]def call_compute_transient_fstat_map(
version, features, multiFstatAtoms=None, windowRange=None
):
"""Call a version of the ComputeTransientFstatMap function.
This checks if the requested `version` is available,
and if so, executes the computation of a transient F-statistic map
over the `windowRange`.
Parameters
----------
version: str
Name of the method to call
(currently supported: 'lal' or 'pycuda').
features: dict
Dictionary of available features,
as obtained from `init_transient_fstat_map_features()`.
multiFstatAtoms: lalpulsar.MultiFstatAtomVector or None
The time-dependent F-stat atoms previously computed by `ComputeFstat`.
windowRange: lalpulsar.transientWindowRange_t or None
The structure defining the transient parameters.
Returns
-------
FstatMap: pyTransientFstatMap or lalpulsar.transientFstatMap_t
The output of the called function,
including the evaluated transient F-statistic map
over the windowRange.
timingFstatMap: float
Execution time of the called function.
"""
if version in fstatmap_versions:
if features[version]:
time0 = time()
FstatMap = fstatmap_versions[version](multiFstatAtoms, windowRange)
timingFstatMap = time() - time0
else:
raise Exception(
"Required module(s) for transient F-stat map"
' method "{}" not available!'.format(version)
)
else:
raise Exception(
'Transient F-stat map method "{}"' " not implemented!".format(version)
)
return FstatMap, timingFstatMap
[docs]def reshape_FstatAtomsVector(atomsVector):
"""Make a dictionary of ndarrays out of an F-stat atoms 'vector' structure.
Parameters
----------
atomsVector: lalpulsar.FstatAtomVector
The atoms in a 'vector'-like structure:
iterating over timestamps as the higher hierarchical level,
with a set of 'atoms' quantities defined at each timestamp.
Returns
-------
atomsDict: dict
A dictionary with an entry for each quantity,
which then is a 1D ndarray over timestamps for that one quantity.
"""
numAtoms = atomsVector.length
atomsDict = {}
atom_fieldnames = [
"timestamp",
"Fa_alpha",
"Fb_alpha",
"a2_alpha",
"ab_alpha",
"b2_alpha",
]
atom_dtypes = [np.uint32, complex, complex, np.float32, np.float32, np.float32]
for f, field in enumerate(atom_fieldnames):
atomsDict[field] = np.ndarray(numAtoms, dtype=atom_dtypes[f])
for n, atom in enumerate(atomsVector.data):
for field in atom_fieldnames:
atomsDict[field][n] = atom.__getattribute__(field)
atomsDict["Fa_alpha_re"] = np.float32(atomsDict["Fa_alpha"].real)
atomsDict["Fa_alpha_im"] = np.float32(atomsDict["Fa_alpha"].imag)
atomsDict["Fb_alpha_re"] = np.float32(atomsDict["Fb_alpha"].real)
atomsDict["Fb_alpha_im"] = np.float32(atomsDict["Fb_alpha"].imag)
return atomsDict
def _get_absolute_kernel_path(kernel):
pyfstatdir = os.path.dirname(os.path.abspath(os.path.realpath(__file__)))
kernelfile = kernel + ".cu"
return os.path.join(pyfstatdir, "pyCUDAkernels", kernelfile)
def _print_GPU_memory_MB(key):
mem_used_MB = drv.mem_get_info()[0] / (2.0 ** 20)
mem_total_MB = drv.mem_get_info()[1] / (2.0 ** 20)
logging.debug(
"{} GPU memory: {:.4f} / {:.4f} MB free".format(key, mem_used_MB, mem_total_MB)
)
[docs]def pycuda_compute_transient_fstat_map(multiFstatAtoms, windowRange):
"""GPU version of computing a transient F-statistic map.
This is based on XLALComputeTransientFstatMap from LALSuite,
(C) 2009 Reinhard Prix, licensed under GPL.
The 'map' consists of F-statistics evaluated over
a range of different `(t0,tau)` pairs
(transient start-times and duration parameters).
This is a high-level wrapper function;
the actual CUDA computations are performed in one of the functions
`pycuda_compute_transient_fstat_map_rect()`
or `pycuda_compute_transient_fstat_map_exp()`,
depending on the window functon defined in `windowRange`.
Parameters
----------
multiFstatAtoms: lalpulsar.MultiFstatAtomVector
The time-dependent F-stat atoms previously computed by `ComputeFstat`.
windowRange: lalpulsar.transientWindowRange_t
The structure defining the transient parameters.
Returns
-------
FstatMap: pyTransientFstatMap
A 2D matrix `F_mn`,
with `m` an index over start-times `t0`,
and `n` an index over duration parameters `tau`,
in steps of `dt0` in `[t0, t0+t0Band]`,
and `dtau` in `[tau, tau+tauBand]`
as defined in the `windowRange` input.
"""
if windowRange.type >= lalpulsar.TRANSIENT_LAST:
raise ValueError(
"Unknown window-type ({}) passed as input."
" Allowed are [0,{}].".format(
windowRange.type, lalpulsar.TRANSIENT_LAST - 1
)
)
# internal dict for search/setup parameters
tCWparams = {}
# first combine all multi-atoms
# into a single atoms-vector with *unique* timestamps
tCWparams["TAtom"] = multiFstatAtoms.data[0].TAtom
TAtomHalf = int(tCWparams["TAtom"] / 2) # integer division
atoms = lalpulsar.mergeMultiFstatAtomsBinned(multiFstatAtoms, tCWparams["TAtom"])
# make a combined input matrix of all atoms vectors, for transfer to GPU
tCWparams["numAtoms"] = atoms.length
atomsDict = reshape_FstatAtomsVector(atoms)
atomsInputMatrix = np.column_stack(
(
atomsDict["a2_alpha"],
atomsDict["b2_alpha"],
atomsDict["ab_alpha"],
atomsDict["Fa_alpha_re"],
atomsDict["Fa_alpha_im"],
atomsDict["Fb_alpha_re"],
atomsDict["Fb_alpha_im"],
)
)
# actual data spans [t0_data, t0_data + tCWparams['numAtoms'] * TAtom]
# in steps of TAtom
tCWparams["t0_data"] = int(atoms.data[0].timestamp)
tCWparams["t1_data"] = int(
atoms.data[tCWparams["numAtoms"] - 1].timestamp + tCWparams["TAtom"]
)
logging.debug(
"Transient F-stat map:"
" t0_data={:d}, t1_data={:d}".format(tCWparams["t0_data"], tCWparams["t1_data"])
)
logging.debug(
"Transient F-stat map:"
" numAtoms={:d}, TAtom={:d},"
" TAtomHalf={:d}".format(tCWparams["numAtoms"], tCWparams["TAtom"], TAtomHalf)
)
# special treatment of window_type = none
# ==> replace by rectangular window spanning all the data
if windowRange.type == lalpulsar.TRANSIENT_NONE:
windowRange.type = lalpulsar.TRANSIENT_RECTANGULAR
windowRange.t0 = tCWparams["t0_data"]
windowRange.t0Band = 0
windowRange.dt0 = tCWparams["TAtom"] # irrelevant
windowRange.tau = tCWparams["numAtoms"] * tCWparams["TAtom"]
windowRange.tauBand = 0
windowRange.dtau = tCWparams["TAtom"] # irrelevant
""" NOTE: indices {i,j} enumerate *actual* atoms and their timestamps t_i,
* while the indices {m,n} enumerate the full grid of values
* in [t0_min, t0_max]x[Tcoh_min, Tcoh_max] in steps of deltaT.
* This allows us to deal with gaps in the data in a transparent way.
*
* NOTE2: we operate on the 'binned' atoms returned
* from XLALmergeMultiFstatAtomsBinned(),
* which means we can safely assume all atoms to be lined up
* perfectly on a 'deltaT' binned grid.
*
* The mapping used will therefore be {i,j} -> {m,n}:
* m = offs_i / deltaT
* start-time offset from t0_min measured in deltaT
* n = Tcoh_ij / deltaT
* duration Tcoh_ij measured in deltaT,
*
* where
* offs_i = t_i - t0_min
* Tcoh_ij = t_j - t_i + deltaT
*
"""
# We allocate a matrix {m x n} = t0Range * TcohRange elements
# covering the full transient window-range [t0,t0+t0Band]x[tau,tau+tauBand]
tCWparams["N_t0Range"] = int(
np.floor(1.0 * windowRange.t0Band / windowRange.dt0) + 1
)
tCWparams["N_tauRange"] = int(
np.floor(1.0 * windowRange.tauBand / windowRange.dtau) + 1
)
FstatMap = pyTransientFstatMap(tCWparams["N_t0Range"], tCWparams["N_tauRange"])
logging.debug(
"Transient F-stat map:"
" N_t0Range={:d}, N_tauRange={:d},"
" total grid points: {:d}".format(
tCWparams["N_t0Range"],
tCWparams["N_tauRange"],
tCWparams["N_t0Range"] * tCWparams["N_tauRange"],
)
)
if windowRange.type == lalpulsar.TRANSIENT_RECTANGULAR:
FstatMap.F_mn = pycuda_compute_transient_fstat_map_rect(
atomsInputMatrix, windowRange, tCWparams
)
elif windowRange.type == lalpulsar.TRANSIENT_EXPONENTIAL:
FstatMap.F_mn = pycuda_compute_transient_fstat_map_exp(
atomsInputMatrix, windowRange, tCWparams
)
else:
raise ValueError(
"Invalid transient window type {}"
" not in [{}, {}].".format(
windowRange.type, lalpulsar.TRANSIENT_NONE, lalpulsar.TRANSIENT_LAST - 1
)
)
# out of loop: get max2F and ML estimates over the m x n matrix
FstatMap.maxF = FstatMap.F_mn.max()
maxidx = np.unravel_index(
FstatMap.F_mn.argmax(), (tCWparams["N_t0Range"], tCWparams["N_tauRange"])
)
FstatMap.t0_ML = windowRange.t0 + maxidx[0] * windowRange.dt0
FstatMap.tau_ML = windowRange.tau + maxidx[1] * windowRange.dtau
logging.debug(
"Done computing transient F-stat map."
" maxF={:.4f}, t0_ML={}, tau_ML={}".format(
FstatMap.maxF, FstatMap.t0_ML, FstatMap.tau_ML
)
)
return FstatMap
[docs]def pycuda_compute_transient_fstat_map_rect(atomsInputMatrix, windowRange, tCWparams):
"""GPU computation of the transient F-stat map for rectangular windows.
As discussed in Keitel & Ashton (CQG 35, 205003, 2018):
https://arxiv.org/abs/1805.05652
this version only does GPU parallization for the outer loop,
keeping the partial sums of the inner loop local to each individual kernel
using the 'memory trick'.
Parameters
----------
atomsInputMatrix: np.ndarray
A 2D array of stacked named columns containing the F-stat atoms.
windowRange: lalpulsar.transientWindowRange_t
The structure defining the transient parameters.
tCWparams: dict
A dictionary of miscellaneous parameters.
Returns
-------
F_mn: np.ndarray
A 2D array of the computed transient F-stat map over the
`[t0,tau]` range.
"""
# gpu data setup and transfer
_print_GPU_memory_MB("Initial")
input_gpu = gpuarray.to_gpu(atomsInputMatrix)
Fmn_gpu = gpuarray.GPUArray(
(tCWparams["N_t0Range"], tCWparams["N_tauRange"]), dtype=np.float32
)
_print_GPU_memory_MB("After input+output allocation:")
# GPU kernel
kernel = "cudaTransientFstatRectWindow"
kernelfile = _get_absolute_kernel_path(kernel)
partial_Fstat_cuda_code = cudacomp.SourceModule(open(kernelfile, "r").read())
partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel)
partial_Fstat_cuda.prepare("PIIIIIIIIP")
# GPU grid setup
blockRows = min(1024, tCWparams["N_t0Range"])
blockCols = 1
gridRows = int(np.ceil(1.0 * tCWparams["N_t0Range"] / blockRows))
gridCols = 1
# running the kernel
logging.debug(
"Calling pyCUDA kernel with a grid of {}*{}={} blocks"
" of {}*{}={} threads each: {} total threads...".format(
gridRows,
gridCols,
gridRows * gridCols,
blockRows,
blockCols,
blockRows * blockCols,
gridRows * gridCols * blockRows * blockCols,
)
)
partial_Fstat_cuda.prepared_call(
(gridRows, gridCols),
(blockRows, blockCols, 1),
input_gpu.gpudata,
tCWparams["numAtoms"],
tCWparams["TAtom"],
tCWparams["t0_data"],
windowRange.t0,
windowRange.dt0,
windowRange.tau,
windowRange.dtau,
tCWparams["N_tauRange"],
Fmn_gpu.gpudata,
)
# return results to host
F_mn = Fmn_gpu.get()
_print_GPU_memory_MB("Final")
return F_mn
[docs]def pycuda_compute_transient_fstat_map_exp(atomsInputMatrix, windowRange, tCWparams):
"""GPU computation of the transient F-stat map for exponential windows.
As discussed in Keitel & Ashton (CQG 35, 205003, 2018):
https://arxiv.org/abs/1805.05652
this version does full GPU parallization
of both the inner and outer loop.
Parameters
----------
atomsInputMatrix: np.ndarray
A 2D array of stacked named columns containing the F-stat atoms.
windowRange: lalpulsar.transientWindowRange_t
The structure defining the transient parameters.
tCWparams: dict
A dictionary of miscellaneous parameters.
Returns
-------
F_mn: np.ndarray
A 2D array of the computed transient F-stat map over the
`[t0,tau]` range.
"""
# gpu data setup and transfer
_print_GPU_memory_MB("Initial")
input_gpu = gpuarray.to_gpu(atomsInputMatrix)
Fmn_gpu = gpuarray.GPUArray(
(tCWparams["N_t0Range"], tCWparams["N_tauRange"]), dtype=np.float32
)
_print_GPU_memory_MB("After input+output allocation:")
# GPU kernel
kernel = "cudaTransientFstatExpWindow"
kernelfile = _get_absolute_kernel_path(kernel)
partial_Fstat_cuda_code = cudacomp.SourceModule(open(kernelfile, "r").read())
partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel)
partial_Fstat_cuda.prepare("PIIIIIIIIIP")
# GPU grid setup
blockRows = min(32, tCWparams["N_t0Range"])
blockCols = min(32, tCWparams["N_tauRange"])
gridRows = int(np.ceil(1.0 * tCWparams["N_t0Range"] / blockRows))
gridCols = int(np.ceil(1.0 * tCWparams["N_tauRange"] / blockCols))
# running the kernel
logging.debug(
"Calling kernel with a grid of {}*{}={} blocks"
" of {}*{}={} threads each: {} total threads...".format(
gridRows,
gridCols,
gridRows * gridCols,
blockRows,
blockCols,
blockRows * blockCols,
gridRows * gridCols * blockRows * blockCols,
)
)
partial_Fstat_cuda.prepared_call(
(gridRows, gridCols),
(blockRows, blockCols, 1),
input_gpu.gpudata,
tCWparams["numAtoms"],
tCWparams["TAtom"],
tCWparams["t0_data"],
windowRange.t0,
windowRange.dt0,
windowRange.tau,
windowRange.dtau,
tCWparams["N_t0Range"],
tCWparams["N_tauRange"],
Fmn_gpu.gpudata,
)
# return results to host
F_mn = Fmn_gpu.get()
_print_GPU_memory_MB("Final")
return F_mn