Source code for guido.genome

import pickle
import subprocess
from pathlib import Path

from pyfaidx import Faidx, Fasta


[docs] def check_file(filename, supported_ext): """Check if file exists and has the right extension. Parameters ---------- filename : str Path to the file. supported_ext : str Supported file extensions. Returns ------- bool True if file exists and has the right extension, False otherwise. """ filename = Path(filename) if filename.exists() and filename.suffix in supported_ext: return True else: return False
[docs] class Genome: def __init__( self, genome_name, genome_file_abspath=None, annotation_file_abspath=None, bowtie_index_abspath=None, ): """Genome class. Parameters ---------- genome_name : str Name of the genome. genome_file_abspath : str, optional Path to the genome Fasta file, by default None annotation_file_abspath : str, optional Path to the genome annotation file (GTF or GFF3), by default None bowtie_index_abspath : str, optional Path to the bowtie index files, by default None """ self.genome_name = genome_name self._bowtie_ignore = False self.bowtie_index = None if genome_file_abspath: if check_file(genome_file_abspath, [".fa", ".fasta", ".fna"]): self.genome_file_abspath = Path(genome_file_abspath).resolve() else: raise ValueError( f"Genome file {genome_file_abspath} does noth exist or is not in the right format." ) if bowtie_index_abspath: self.bowtie_index = Path(bowtie_index_abspath) self._bowtie_ignore = True if annotation_file_abspath: if check_file(annotation_file_abspath, [".gtf", ".gff3"]): self.annotation_file_abspath = Path(annotation_file_abspath).resolve() else: raise ValueError( f"Annotation file file {genome_file_abspath} does noth exist or is not in the right format." ) else: self.annotation_file_abspath = None @property
[docs] def is_built(self): """Check if genome index is built. Returns ------- Bool True if genome index is built, False otherwise. """ ready = [] if self.genome_file_abspath and self.fai_file: ready.append(True) else: return False if self.annotation_file_abspath and not self.tbi_file: ready.append(False) else: ready.append(True) bowtie_index_files = [ i for i in self.genome_file_abspath.parent.glob("*ebwt") if f"{self.genome_name}.rev." in str(i) and f"{self.genome_name}." in str(i) ] if not self._bowtie_ignore and any(bowtie_index_files): ready.append(True) elif not self._bowtie_ignore and not any(bowtie_index_files): ready.append(False) return all(ready)
[docs] def build( self, n_threads=1, save_pickle=True, bowtie_ignore=False, bowtie_path="", ): """Build genome index. This method creates index files for the genome and annotation files and saves them in a `.guido` file in the same directory as the genome Fasta file. This file can be later loaded using :meth:`guido.genome.load_genome_from_file` without having to re-build the index. Parameters ---------- n_threads : int, optional Number of threads, by default 1 save_pickle : bool, optional Pickle the dictionary into `.guido` file, by default True bowtie_ignore : bool, optional Ignore building bowtie index. Use if you already have bowtie index built, by default False bowtie_path : str, optional Path to bowtie binary if it's not in the path, by default "" """ self._bowtie_ignore = bowtie_ignore if self.annotation_file_abspath: print("Indexing genome annotation.") annotation_sorted_bgz = self.annotation_file_abspath.with_suffix( str(self.annotation_file_abspath.suffix) + ".gz" ) annotation_sort_bgz_cmd = f"sort -k1,1 -k4,4n {str(self.annotation_file_abspath)} | bgzip > {annotation_sorted_bgz}" self.annotation_sorted_gz_file = annotation_sorted_bgz _ = subprocess.run(annotation_sort_bgz_cmd, shell=True) tabix_cmd = f"tabix {str(self.annotation_sorted_gz_file)}" _ = subprocess.run(tabix_cmd, stderr=subprocess.PIPE, shell=True) self.tbi_file = self.annotation_sorted_gz_file.with_suffix(".tbi") if self.genome_file_abspath.exists(): Faidx(str(self.genome_file_abspath)) self.fai_file = self.genome_file_abspath.with_suffix( str(self.genome_file_abspath.suffix) + ".fai" ) if not self._bowtie_ignore: bowtie_index_command = f"{bowtie_path}bowtie-build {str(self.genome_file_abspath)} {str(self.genome_file_abspath.parent / self.genome_name)} --threads {n_threads}" print("Building Bowtie index") bowtie_index_proc = subprocess.run( bowtie_index_command.strip(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, ) if bowtie_index_proc.stderr: print(bowtie_index_proc.stderr) else: self.bowtie_index = Path( f"{str(self.genome_file_abspath.parent / self.genome_name)}" ) print(f"Done: {self.bowtie_index }") if save_pickle: guido_file = Path( f"{str(self.genome_file_abspath.parent / self.genome_name)}.guido" ) with open(guido_file, "wb") as f: pickle.dump(self.__dict__, f, protocol=pickle.HIGHEST_PROTOCOL) print( f"{self.genome_name} genome data can now be used by Guido: {str(guido_file)}" )
@property
[docs] def sequence(self): if self.is_built: return Fasta(str(self.genome_file_abspath))
[docs] def load_genome_from_file(guido_file): """Load a genome from a .guido file. This file is created when a genome is built. It contains all the information needed to use the genome. Guido files are saved in the same directory as the genome FASTA file. They are named after the genome name and have the .guido extension. Genome object can be created by using the :method:`build` method. Parameters ---------- guido_file : str Path to the .guido file. Returns ------- Genome :class:`Genome` object. """ if Path(guido_file).exists(): with open(guido_file, "rb") as f: guido_dict = pickle.load(f) G = Genome(genome_name=guido_dict["genome_name"]) for key, val in guido_dict.items(): setattr(G, key, val) return G else: raise ValueError(f"Provided path to .guido ({guido_file}) file does not exist.")