Skip to content

Documentation for lanctools Python API

Classes

LancData

lanctools.core.LancData

The genotype and local ancestry data for a single chromosome/dataset.

Attributes:

- pgen (PgenReader): A pgenlib PgenReader object.
- pvar (PvarReader): A pgenlib PvarReader object.
- lanc: A FlatLanc object with local ancestry data.
- ancestries: An ordered list of ancestry names. The integer codes in the .lanc file and self.lanc are indices into this list (e.g. code 0 -> ancestries[0]).
- plink_prefix: The prefix for the corresponding plink2 fileset.

Source code in src/lanctools/core.py
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
class LancData:
    """The genotype and local ancestry data for a single chromosome/dataset.

    :ivar pgen: A pgenlib ``PgenReader`` object.
    :ivar pvar: A pgenlib ``PvarReader`` object.
    :ivar lanc: A FlatLanc object with local ancestry data.
    :ivar ancestries: An ordered list of ancestry names. The integer codes in
        the .lanc file and ``self.lanc`` correspond to indices in this list
        (e.g. ``0 -> ancestries[0]``).
    :ivar plink_prefix: The prefix for the corresponding plink2 fileset.
    """

    def __init__(
        self,
        plink_prefix: str,
        lanc_file: str,
        ancestries: Optional[list[str]] = None,
    ):
        """Constructs a LancData from plink2 files.

        :param plink_prefix: Prefix of a plink2 fileset; ``.pgen`` and
            ``.pvar`` are appended to it to locate the files.
        :type plink_prefix: str
        :param lanc_file: Path to a .lanc file.
        :type lanc_file: str
        :param ancestries: Optional ordered ancestry names corresponding to
            the integer codes of the .lanc file. When omitted, the names
            ``"0"``, ``"1"``, ... up to the largest code found in the file
            are used.
        :type ancestries: list[str] or None
        """
        pgen = PgenReader(bytes(plink_prefix + ".pgen", "utf8"))
        pvar = PvarReader(bytes(plink_prefix + ".pvar", "utf8"))
        lanc = _read_lanc(lanc_file)

        if ancestries is None:
            ancestries = self._default_ancestries(lanc)

        self.pgen = pgen
        self.pvar = pvar
        self.lanc = lanc
        self.ancestries = ancestries
        self.plink_prefix = plink_prefix

    @staticmethod
    def _default_ancestries(lanc) -> list[str]:
        """Build default ancestry names that keep ``code == list index``.

        Every code from 0 up to the largest observed one gets a name, even if
        it never occurs in the data. Naming only the observed codes would
        shift later names to lower indices, and the dosage/masking methods
        (which compare against ``range(len(ancestries))``) would silently
        drop the highest codes.

        :param lanc: Object exposing ``left_haps`` and ``right_haps`` arrays
            of integer ancestry codes.
        :return: ``["0", "1", ..., str(max_code)]``, or an empty list when
            there are no codes at all.
        :rtype: list[str]
        """
        codes = np.concatenate([lanc.left_haps, lanc.right_haps])
        if codes.size == 0:
            return []
        return [str(code) for code in range(int(codes.max()) + 1)]

    def _ancestry_masks(
        self, lanc: NDArray[np.uint8]
    ) -> tuple[NDArray[np.int32], NDArray[np.int32]]:
        """One-hot encode the ancestry of each haplotype.

        :param lanc: Local ancestry codes with the two haplotypes on the last
            axis, as produced by :meth:`get_lanc`.
        :type lanc: numpy.ndarray
        :return: ``(left, right)`` 0/1 masks of dtype ``int32`` whose last
            axis has length ``len(self.ancestries)``.
        :rtype: tuple[numpy.ndarray, numpy.ndarray]
        """
        codes = np.arange(len(self.ancestries), dtype=np.uint8)[None, None, :]
        # Length-1 slices keep the last axis so it broadcasts against `codes`.
        left = (lanc[:, :, 0:1] == codes).astype(np.int32)
        right = (lanc[:, :, 1:2] == codes).astype(np.int32)
        return left, right

    def get_info(self, indices: NDArray[np.uint32]) -> DataFrame:
        """Query info for a set of variants.

        :param indices: Array of 0-based variant indices in pvar order, shape
            ``(V,)``; annotated as an unsigned 32-bit integer array.
        :type indices: numpy.ndarray
        :return: A pandas ``DataFrame`` with one row per variant and the
            columns ``chrom`` (str, chromosome name), ``pos`` (uint32,
            1-based genomic position), ``ref`` (str, reference allele),
            ``alt`` (str, alternate allele) and ``rsid`` (str, variant
            identifier).
        :rtype: pandas.DataFrame
        """
        return _get_info(self.pvar, indices)

    def get_lanc(self, indices: NDArray[np.unsignedinteger]) -> NDArray[np.uint8]:
        """Query phased local ancestry.

        :param indices: Array of 0-based variant indices in pvar order, shape
            ``(V,)``; annotated as an unsigned integer array.
        :type indices: numpy.ndarray
        :return: Local ancestries, shape ``(N, V, 2)``, dtype ``uint8``
        :rtype: numpy.ndarray
        """
        left, right = _get_lanc(
            self.lanc.left_haps,
            self.lanc.right_haps,
            self.lanc.breakpoints,
            self.lanc.offsets,
            indices,
        )
        # Put the two haplotypes side by side on a new trailing axis.
        return np.stack((left, right), axis=-1)

    def get_lanc_dosage(self, indices: NDArray[np.uint32]) -> NDArray[np.int32]:
        """Query local ancestry dosage.

        The dosage of an ancestry is the number of haplotypes (0, 1 or 2)
        carrying that ancestry at a variant.

        :param indices: Array of 0-based variant indices in pvar order, shape
            ``(V,)``; annotated as an unsigned 32-bit integer array.
        :type indices: numpy.ndarray
        :return: Local ancestry dosages, shape
            ``(N, V, len(self.ancestries))``, dtype ``int32``.
        :rtype: numpy.ndarray
        """
        lanc = np.asarray(self.get_lanc(indices), dtype=np.uint8)
        left_mask, right_mask = self._ancestry_masks(lanc)
        return left_mask + right_mask

    def get_geno(self, indices: NDArray[np.uint32]) -> NDArray[np.int32]:
        """Query phased genotypes.

        :param indices: Array of 0-based variant indices in pvar order, shape
            ``(V,)``; annotated as an unsigned 32-bit integer array.
        :type indices: numpy.ndarray
        :return: Phased genotypes, shape ``(N, V, 2)``, dtype ``int32``.
        :rtype: numpy.ndarray
        """
        return _get_geno(self.pgen, indices)

    def get_lanc_geno(self, indices: NDArray[np.unsignedinteger]) -> NDArray[np.int32]:
        """Query genotypes deconvoluted/masked by ancestry.

        Each haplotype's allele is credited to the ancestry assigned to that
        haplotype at the variant; all other ancestries receive 0 from it.

        :param indices: Array of 0-based variant indices in pvar order, shape
            ``(V,)``; annotated as an unsigned integer array.
        :type indices: numpy.ndarray
        :return: Genotypes masked by ancestry, shape
            ``(N, V, len(self.ancestries))``, dtype ``int32``.
        :rtype: numpy.ndarray
        """
        geno = np.asarray(self.get_geno(indices), dtype=np.int32)
        lanc = np.asarray(self.get_lanc(indices), dtype=np.uint8)
        left_mask, right_mask = self._ancestry_masks(lanc)
        return left_mask * geno[:, :, 0:1] + right_mask * geno[:, :, 1:2]
__init__(plink_prefix, lanc_file, ancestries=None)

Constructs a LancData from plink2 files.

Args:

- plink_prefix: A string with the prefix for a plink2 fileset.
- lanc_file: A string with the path to a .lanc file.
- ancestries: An optional ordered list of ancestry names corresponding to the integer codes in the .lanc file.

Source code in src/lanctools/core.py
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
def __init__(
    self,
    plink_prefix: str,
    lanc_file: str,
    ancestries: Optional[list[str]] = None,
):
    """Constructs a LancData from plink2 files.

    :param plink_prefix: Prefix of a plink2 fileset; ``.pgen`` and ``.pvar``
        are appended to it to locate the files.
    :type plink_prefix: str
    :param lanc_file: Path to a .lanc file.
    :type lanc_file: str
    :param ancestries: Optional ordered ancestry names corresponding to the
        integer codes of the .lanc file. When omitted, the names ``"0"``,
        ``"1"``, ... up to the largest code found in the file are used.
    :type ancestries: list[str] or None
    """
    pgen = PgenReader(bytes(plink_prefix + ".pgen", "utf8"))
    pvar = PvarReader(bytes(plink_prefix + ".pvar", "utf8"))
    lanc = _read_lanc(lanc_file)

    if ancestries is None:
        # Ancestry codes are used as indices into `ancestries`, so name every
        # code from 0 to the largest observed one. Naming only the codes that
        # occur would shift later names to lower indices whenever a code is
        # absent from this file.
        codes = np.concatenate([lanc.left_haps, lanc.right_haps])
        n_codes = int(codes.max()) + 1 if codes.size else 0
        ancestries = [str(code) for code in range(n_codes)]

    self.pgen = pgen
    self.pvar = pvar
    self.lanc = lanc
    self.ancestries = ancestries
    self.plink_prefix = plink_prefix
get_geno(indices)

Query phased genotypes.

Parameters:

Name Type Description Default
indices NDArray[uint32]

Array of variant indices in pvar order (0-based), shape (V,), dtype int32.

required

Returns:

Type Description
numpy.ndarray

Phased genotypes, shape (N, V, 2), dtype int32.

Source code in src/lanctools/core.py
368
369
370
371
372
373
374
375
376
377
378
def get_geno(self, indices: NDArray[np.uint32]) -> NDArray[np.int32]:
    """Fetch phased genotypes for the requested variants.

    :param indices: 0-based variant indices in pvar order, shape ``(V,)``;
        annotated as an unsigned 32-bit integer array.
    :type indices: numpy.ndarray
    :return: Phased genotypes, shape ``(N, V, 2)``, dtype ``int32``.
    :rtype: numpy.ndarray
    """
    # All reading is delegated to the module-level helper on the open reader.
    reader = self.pgen
    genotypes = _get_geno(reader, indices)
    return genotypes
get_info(indices)

Query info for a set of variants.

Args: indices: Array of variant indices in pvar order (0-based), shape (V,), dtype int32.

Returns: A pandas DataFrame with one row per variant and the following columns:

- chrom (str): Chromosome name
- pos (uint32): 1-based genomic position
- ref (str): Reference allele
- alt (str): Alternate allele
- rsid (str): Variant identifier

Source code in src/lanctools/core.py
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
def get_info(self, indices: NDArray[np.uint32]) -> DataFrame:
    """Look up variant metadata for the requested variants.

    :param indices: 0-based variant indices in pvar order, shape ``(V,)``;
        annotated as an unsigned 32-bit integer array.
    :type indices: numpy.ndarray
    :return: A pandas ``DataFrame`` with one row per variant and the columns
        ``chrom`` (str, chromosome name), ``pos`` (uint32, 1-based genomic
        position), ``ref`` (str, reference allele), ``alt`` (str, alternate
        allele) and ``rsid`` (str, variant identifier).
    :rtype: pandas.DataFrame
    """
    # The table is assembled by the module-level helper from the pvar reader.
    variant_reader = self.pvar
    info = _get_info(variant_reader, indices)
    return info
get_lanc(indices)

Query phased local ancestry.

Parameters:

Name Type Description Default
indices NDArray[unsignedinteger]

Array of variant indices in pvar order (0-based), shape (V,), dtype int32.

required

Returns:

Type Description
numpy.ndarray

Local ancestries, shape (N, V, 2), dtype uint8

Source code in src/lanctools/core.py
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
def get_lanc(self, indices: NDArray[np.unsignedinteger]) -> NDArray[np.uint8]:
    """Fetch the phased local ancestry of the requested variants.

    :param indices: 0-based variant indices in pvar order, shape ``(V,)``;
        annotated as an unsigned integer array.
    :type indices: numpy.ndarray
    :return: Local ancestries, shape ``(N, V, 2)``, dtype ``uint8``
    :rtype: numpy.ndarray
    """
    flat = self.lanc
    hap_a, hap_b = _get_lanc(
        flat.left_haps,
        flat.right_haps,
        flat.breakpoints,
        flat.offsets,
        indices,
    )
    # The two haplotypes become the entries of a new trailing axis.
    return np.stack([hap_a, hap_b], axis=-1)
get_lanc_dosage(indices)

Query local ancestry dosage.

Parameters:

Name Type Description Default
indices NDArray[uint32]

Array of variant indices in pvar order (0-based), shape (V,), dtype int32.

required

Returns:

Type Description
numpy.ndarray

Local ancestry dosages, shape (N, V, len(self.ancestries)), dtype int32.

Source code in src/lanctools/core.py
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
def get_lanc_dosage(self, indices: NDArray[np.uint32]) -> NDArray[np.int32]:
    """Query local ancestry dosage.

    The dosage of an ancestry is the number of haplotypes (0, 1 or 2)
    carrying that ancestry at a variant.

    :param indices: Array of 0-based variant indices in pvar order, shape
        ``(V,)``; annotated as an unsigned 32-bit integer array.
    :type indices: numpy.ndarray
    :return: Local ancestry dosages, shape ``(N, V, len(self.ancestries))``,
        dtype ``int32``.
    :rtype: numpy.ndarray
    """

    lanc = np.asarray(self.get_lanc(indices), dtype=np.uint8)
    # One candidate code per ancestry, shaped to broadcast over (N, V, 1).
    codes = np.arange(len(self.ancestries), dtype=np.uint8)[None, None, :]
    # Length-1 slices keep the haplotype axis so it broadcasts against codes.
    left_is_code = lanc[:, :, 0:1] == codes
    right_is_code = lanc[:, :, 1:2] == codes
    # Cast before adding: the sum is int32 (not uint8), and adding the raw
    # boolean masks would be a logical OR that could never reach 2.
    return left_is_code.astype(np.int32) + right_is_code.astype(np.int32)
get_lanc_geno(indices)

Query genotypes deconvoluted/masked by ancestry.

Parameters:

Name Type Description Default
indices NDArray[unsignedinteger]

Array of variant indices in pvar order (0-based), shape (V,), dtype int32.

required

Returns:

Type Description
numpy.ndarray

Genotypes masked by ancestry, shape (N, V, len(self.ancestries)), dtype int32.

Source code in src/lanctools/core.py
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
def get_lanc_geno(self, indices: NDArray[np.unsignedinteger]) -> NDArray[np.int32]:
    """Split phased genotypes into per-ancestry contributions.

    Each haplotype's allele is credited to the ancestry assigned to that
    haplotype at the variant; every other ancestry receives 0 from it.

    :param indices: 0-based variant indices in pvar order, shape ``(V,)``;
        annotated as an unsigned integer array.
    :type indices: numpy.ndarray
    :return: Genotypes masked by ancestry, shape ``(N, V, len(self.ancestries))``, dtype ``int32``.
    :rtype: numpy.ndarray
    """
    hap_geno = np.asarray(self.get_geno(indices), dtype=np.int32)
    hap_lanc = np.asarray(self.get_lanc(indices), dtype=np.uint8)
    codes = np.arange(len(self.ancestries), dtype=np.uint8).reshape(1, 1, -1)

    # For each haplotype, a 0/1 ancestry indicator times that haplotype's
    # allele; length-1 slices keep the last axis so it broadcasts over codes.
    contributions = [
        (hap_lanc[:, :, hap : hap + 1] == codes).astype(np.int32)
        * hap_geno[:, :, hap : hap + 1]
        for hap in (0, 1)
    ]
    first, second = contributions
    return first + second