Source code for guido.locus

import re

import allel
import mcdm
import numpy as np
import pandas as pd
import pyranges
from pyfaidx import Fasta, Sequence

from .guides import Guide
from .helpers import (
    _guides_detailed_table,
    _guides_to_bed,
    _guides_to_csv,
    _guides_to_dataframe,
    load_cfd_scoring_matrix,
)
from .off_targets import (
    calculate_cfd_score,
    calculate_ot_sum_score,
    get_off_targets_string,
    run_bowtie,
)


[docs] class Locus: def __init__( self, sequence, name=None, start=1, end=None, genome=None, annotation=None, **kwargs, ): """Locus object that represents a genomic locus in which gRNAs are searched. Parameters ---------- sequence : Sequence, str Nucleotide sequence. Can be defined as `pyfaidx.Sequence` or `str`. name : str, optional Name of the sequence, contig or chromosome, by default None. start : int, optional Starting position of the locus. It should be 1-based. By default 1. end : int, optional Ending position of the locus. genome : Genome, optional Genome object that also includes the genomic locus defined here. It is used by default to search for gRNA off-targets. annotation : pd.DataFrame, optional Annotation table that provides genomic features and is used to annotate gRNAs that are found in the locus. Examples ---------- >>> import guido >>> seq = "TTATCATCCACTCTGACGGGTGGTATTGCGCAACTCCACGCCATCAAACATGTTCAGATTATGCAATCGTGAGTATTCGTTGACCACCGCTTGACCTGTGT" >>> loc = guido.Locus( ... sequence=seq, name="AgamP4_2R", start=48714554, end=48714654 ... ) >>> loc.find_guides() >>> loc.guides [gRNA-1(CGCAATACCACCCGTCAGAGTGG|AgamP4_2R:48714561-48714584|-|), gRNA-2(TTATCATCCACTCTGACGGGTGG|AgamP4_2R:48714554-48714577|+|), gRNA-3(TCTGAACATGTTTGATGGCGTGG|AgamP4_2R:48714589-48714612|-|), gRNA-4(CATAATCTGAACATGTTTGATGG|AgamP4_2R:48714594-48714617|-|)] """ self.sequence = sequence self.chromosome = name self.start = start self.intervals = [] self.genome = genome self.annotation = annotation self.pam = "" self.guides = [] self._layers = {} if isinstance(sequence, Sequence): self.sequence = sequence self.start = sequence.start self.end = sequence.end self.chromosome = sequence.name else: self.sequence = Sequence( seq=sequence, start=start, end=end, name=name, **kwargs ) if end and isinstance(end, int): self.end = end elif not end: self.end = self.start + len(self.sequence.seq) self.length = len(self.sequence)
[docs] def __repr__(self): """Returns a string representation of the locus object.""" return f"Locus({self.chromosome}:{self.start}-{self.end})"
[docs] def to_dict(self): """Converts the locus object to a dictionary.""" return self.__dict__
[docs] def guide(self, ix): """Fetch a guide from the locus by its index or name. Parameters ---------- ix : str or int Index of the gRNA. Returns ------- g: Guide Guide object representing a gRNA Examples -------- >>> import guido >>> seq = "TTATCATCCACTCTGACGGGTGGTATTGCGCAACTCCACGCCATCAAACATGTTCAGATTATGCAATCGTGAGTATTCGTTGACCACCGCTTGACCTGTGT" >>> loc = guido.Locus( ... sequence=seq, name="AgamP4_2R", start=48714554, end=48714654 ... ) >>> loc.find_guides() >>> loc.guide("gRNA-1") gRNA-1(CGCAATACCACCCGTCAGAGTGG|AgamP4_2R:48714561-48714584|-|) >>> loc.guide(0) gRNA-1(CGCAATACCACCCGTCAGAGTGG|AgamP4_2R:48714561-48714584|-|) """ if isinstance(ix, str): for g in self.guides: if g.id == ix: return g else: raise ValueError("Provided gRNA index is not valid.") elif isinstance(ix, int): return self.guides[ix] else: raise ValueError("Provided gRNA index is not valid.")
[docs] def _flatten_intervals(self, intervals): """Flattens overlapping intervals into an union.""" fi = [] for start, end in intervals: if fi and fi[-1][1] >= start - 1: fi[-1][1] = max(fi[-1][1], end) else: if start < self.start: start = self.start if end > self.end: end = self.end fi.append([start, end]) return fi
[docs] def _find_guides_in_interval(self, sequence, start, pam): """Finds guides in interval.""" iupac_dict = { "A": ("A", "T"), "C": ("C", "G"), "G": ("G", "C"), "T": ("T", "A"), "R": ("[AG]", "[CT]"), "Y": ("[CT]", "[AG]"), "S": ("[GC]", "[GC]"), "W": ("[AT]", "[AT]"), "K": ("[GT]", "[AC]"), "M": ("[AC]", "[GT]"), "B": ("[CGT]", "[ACG]"), "D": ("[AGT]", "[ACT]"), "H": ("[ACT]", "[AGT]"), "V": ("[ACG]", "[CGT]"), "N": ("[ACGT]", "[ACGT]"), } iupac_pam = "".join([iupac_dict[letter][0] for letter in pam]) rev_iupac_pam = "".join([iupac_dict[letter][1] for letter in pam[::-1]]) fwd_seq = sequence.upper() pams_fw = [m.start() for m in re.finditer(rf"(?=({iupac_pam}))", fwd_seq)] pams_rv = [m.start() for m in re.finditer(rf"(?=({rev_iupac_pam}))", fwd_seq)] guides_fw = [ Guide( sequence=fwd_seq, pam_position=pam_position, pam_len=len(pam), strand="+", chromosome=self.chromosome, start=start, ) for pam_position in pams_fw ] guides_rv = [ Guide( sequence=fwd_seq, pam_position=pam_position, pam_len=len(pam), strand="-", chromosome=self.chromosome, start=start, ) for pam_position in pams_rv ] guides = [ guide for guide in guides_fw + guides_rv if "N" not in guide.guide_seq ] return guides
[docs] def find_guides( self, pam="NGG", min_flanking_length=0, selected_features="all", ): """Find gRNAs in the locus. Parameters ---------- pam : str, optional gRNA PAM sequence, by default "NGG" min_flanking_length : int, optional Defines flanking region from the locus where gRNAs are ignored. By default 0, however `simulate_end_joining()` requires flanking region of 75 bp to simulate MMEJ. selected_features : str, optional Limit gRNA search on only specified genomic features. Features are defined in the provided genome annotation file. By default {"all"} Returns ------- sorted_guides : list List of gRNAs sorted by their position in the locus. Examples -------- >>> import guido >>> genome = guido.load_genome_from_file( ... guido_file="/Users/nkranjc/imperial/ref/new/AgamP4.guido" ... ) >>> loc = guido.locus_from_coordinates(genome, "AgamP4_2R", 48714541, 48714666) >>> loc.find_guides() >>> loc.guides [gRNA-1(AAGTTTATCATCCACTCTGACGG|AgamP4_2R:48714550-48714572|+|), gRNA-2(CGCAATACCACCCGTCAGAGTGG|AgamP4_2R:48714561-48714583|-|), gRNA-3(AGTTTATCATCCACTCTGACGGG|AgamP4_2R:48714551-48714573|+|), gRNA-4(TTATCATCCACTCTGACGGGTGG|AgamP4_2R:48714554-48714576|+|), gRNA-5(TCTGAACATGTTTGATGGCGTGG|AgamP4_2R:48714589-48714611|-|), gRNA-6(CATAATCTGAACATGTTTGATGG|AgamP4_2R:48714594-48714616|-|), gRNA-7(GTTTAACACAGGTCAAGCGGTGG|AgamP4_2R:48714637-48714659|-|), gRNA-8(TATGTTTAACACAGGTCAAGCGG|AgamP4_2R:48714640-48714662|-|)] Searching for gRNAs in a specific genomic feature: >>> loc.find_guides(selected_features="exon") >>> loc.guides [gRNA-1(AAGTTTATCATCCACTCTGACGG|AgamP4_2R:48714550-48714572|+|), gRNA-2(CGCAATACCACCCGTCAGAGTGG|AgamP4_2R:48714561-48714583|-|), gRNA-3(AGTTTATCATCCACTCTGACGGG|AgamP4_2R:48714551-48714573|+|), gRNA-4(TTATCATCCACTCTGACGGGTGG|AgamP4_2R:48714554-48714576|+|), gRNA-5(TCTGAACATGTTTGATGGCGTGG|AgamP4_2R:48714589-48714611|-|), gRNA-6(CATAATCTGAACATGTTTGATGG|AgamP4_2R:48714594-48714616|-|)] """ # save searched PAM self.pam = pam # reset back to empty self.guides = [] # take default locus bounds self.intervals = [[self.start, self.end]] if "all" not in selected_features and isinstance(self.annotation, pd.DataFrame): locus_annotation = self.annotation.query( "(Feature == @selected_features) & \ (Chromosome == @self.chromosome) & \ (((Start >= @self.start) & (Start <= @self.end)) | \ ((End >= @self.start) & (End <= @self.end)))" ).sort_values("Start") # split the locus into smaller loci defined by features if len(locus_annotation) > 0: self.intervals = self._flatten_intervals( [ [f_start, f_end] for f_start, f_end in locus_annotation[["Start", "End"]].values ] ) # search for guides in each locus for interval_start, interval_end in self.intervals: # check if there is sequence 30 bp upstrean rel_interval_start = interval_start - min_flanking_length - self.start rel_interval_end = interval_end + min_flanking_length - self.start if rel_interval_start < 0: rel_interval_start = 0 if rel_interval_end > len(self.sequence.seq): rel_interval_end = len(self.sequence.seq) locus_sequence = self.sequence.seq[rel_interval_start:rel_interval_end] locus_guides = self._find_guides_in_interval( locus_sequence, interval_start - min_flanking_length, pam ) # dont add shorter sequences, check that cut position is in the interval locus_guides_filtered = [] for ix, g in enumerate(locus_guides): if ( len(g.guide_seq) == 23 and interval_start <= g.absolute_cut_pos <= interval_end ): locus_guides_filtered.append(g) self.guides.extend(locus_guides_filtered) # sort guides by cut position sorted_guides = [] for ix, g in enumerate(sorted(self.guides, key=lambda g: g.absolute_cut_pos)): g.id = f"gRNA-{ix + 1}" sorted_guides.append(g) self.guides = sorted_guides return sorted_guides
[docs] def simulate_end_joining(self, n_patterns=5, length_weight=20): """Simulate end-joining and find MMEJ deletion patterns for each gRNA. Microhomology scores are calculated based on proposed scoring model described by Bae et al. 2014. Parameters ---------- n_patterns : int, optional Number of top scored MMEJ deletion patterns reported. By default 5. length_weight : int, optional Length weight parameter used in MMEJ scoring as defined by Bae et al. 2015. By default, 20. """ if len(self.guides) == 0: raise ValueError("No gRNAs saved yet.") for G in self.guides: G.simulate_end_joining(n_patterns, length_weight)
[docs] def find_off_targets(self, external_genome=None, **kwargs): """Find off-targets in the genome for each gRNA. Parameters ---------- external_genome : Genome, optional If provided, off-target search is performed in the external genome rather than in the genome which Locus is a part of. By default None. """ # TODO refactor - the same is used in Guide class if external_genome: index_path = external_genome.bowtie_index elif self.genome: index_path = self.genome.bowtie_index else: raise ValueError("No genome / locus specified.") if index_path: guides_bowtie_offtargets = run_bowtie( guides=self.guides, pam=self.pam, genome_index_path=index_path, **kwargs ) mm_scores, pam_scores = load_cfd_scoring_matrix() for ix, G in enumerate(self.guides): if ix in guides_bowtie_offtargets.keys(): G.off_targets = guides_bowtie_offtargets[ix] G.add_layer("ot_sum_score", calculate_ot_sum_score(G.off_targets)) G.off_target_str = get_off_targets_string(G.off_targets) # Calculate CFD scores cfd_scores = calculate_cfd_score( G, G.off_targets, mm_scores, pam_scores ) for ix, cfd in enumerate(cfd_scores.tolist()): G.off_targets[ix]["cfd_score"] = cfd # scores are empty if there are no off-targets, set nan if len(cfd_scores) == 0: G.add_layer("ot_cfd_score_mean", np.nan) G.add_layer("ot_cfd_score_max", np.nan) G.add_layer("ot_cfd_score_sum", np.nan) else: G.add_layer("ot_cfd_score_mean", cfd_scores.mean()) G.add_layer("ot_cfd_score_max", cfd_scores.max()) G.add_layer("ot_cfd_score_sum", cfd_scores.sum()) else: raise ValueError("Bowtie index is not built for the genome / locus.") return guides_bowtie_offtargets
[docs] def _apply_clipped_layer_data(self, guides, layer_name, layer_data): """Apply layer data to guides.""" if len(guides) > 0: for g in self.guides: ix = g.guide_start - self.start g.add_layer(layer_name, layer_data[ix : ix + 23])
# TODO: move to util functions
[docs] def _get_guide_regions(self, guide): if guide.guide_strand == "-": pam_pos = (guide.guide_start, guide.guide_start + 1) seed_pos = (guide.guide_start + 3, guide.guide_start + 13) seed_small_pos = (guide.guide_start + 3, guide.guide_start + 7) else: pam_pos = (guide.guide_end - 1, guide.guide_end) seed_pos = (guide.guide_end - 13, guide.guide_end - 3) seed_small_pos = (guide.guide_end - 7, guide.guide_end - 3) guide_pos = (guide.guide_start, guide.guide_end) guide_regions = [pam_pos, seed_small_pos, seed_pos, guide_pos] return guide_regions
# TODO: move to util functions
[docs] def _guide_sequence_diversity(self, guide, g, pos): """Calculate sequence diversity for each region of the guide.""" guide_regions = self._get_guide_regions(guide) regions_vals = [] for r in guide_regions: try: region_loc = pos.locate_range(r[0], r[1]) region_pos = pos[region_loc] region_ac = g[region_loc].count_alleles() pi = allel.sequence_diversity(region_pos, region_ac) regions_vals.append(pi) except Exception: regions_vals.append(0) return regions_vals
[docs] def _guide_alt_ac(self, guide, g, pos): """Calculate alternative allele count for each region of the guide.""" guide_regions = self._get_guide_regions(guide) regions_vals = [] for r in guide_regions: try: region_loc = pos.locate_range(r[0], r[1]) regions_vals.append(g[region_loc].count_alleles()[:, 1:].sum()) except Exception: regions_vals.append(0) return regions_vals
[docs] def _guide_n_variants(self, guide, g, pos): """Calculate number of variants for each region of the guide.""" guide_regions = self._get_guide_regions(guide) regions_vals = [] for r in guide_regions: try: region_loc = pos.locate_range(r[0], r[1]) regions_vals.append(g[region_loc].n_variants) except Exception: regions_vals.append(0) return regions_vals
[docs] def _apply_variation_layer_data( self, guides, layer_name, layer_genotype_data, layer_pos ): """Apply sequence diversity, alternative allele count and number of variants as layers.""" guide_regions = ["pam", "seed", "small_seed", "guide"] if len(guides) > 0: for g in self.guides: guide_sequence_diversity = self._guide_sequence_diversity( g, layer_genotype_data, layer_pos ) guide_alt_ac = self._guide_alt_ac(g, layer_genotype_data, layer_pos) guide_n_variants = self._guide_n_variants( g, layer_genotype_data, layer_pos ) for i in range(len(guide_regions)): g.add_layer( f"var_{layer_name}_{guide_regions[i]}_pi", guide_sequence_diversity[i], ) g.add_layer( f"var_{layer_name}_{guide_regions[i]}_alt_ac", guide_alt_ac[i] ) g.add_layer( f"var_{layer_name}_{guide_regions[i]}_variants", guide_n_variants[i], )
@property
[docs] def layers(self): """Layers of the locus. Returns ------- Layers List of layers """ return self._layers
[docs] def add_layer( self, name, layer_data, layer_pos=None, apply_to_guides=True, is_variation=False, ): """Adds a layer with the data to the locus. Parameters ---------- name : str Name of the layer layer_data : np.ndarray Layer data. Needs to be the same shape as the locus. apply_to_guides : bool, optional Apply layer data to gRNAs when adding it to the locus. By default True. Examples -------- >>> locus = Locus("chr1", 100, 200) >>> layer_data = np.random.rand(100) >>> locus.add_layer("random", layer_data) """ # TODO: make it more intuitive self._layers[name] = layer_data if apply_to_guides: if len(self.guides) > 0: if is_variation and layer_pos: self._apply_variation_layer_data( self.guides, name, layer_data, layer_pos ) else: if layer_data.shape[0] == self.length: self._apply_clipped_layer_data(self.guides, name, layer_data) else: raise ValueError( f"Layer and locus not the same lenght ({layer_data.shape[0]}, {self.length})." ) else: raise ValueError("No guides to apply the data to.")
[docs] def _guide_layers(self): """Returns a list of all the layers that are present in the gRNAs.""" layers = [] for g in self.guides: layers.extend(g.layers.keys()) return set(layers)
[docs] def _prepare_alt_matrix(self, rank_layer_names, method=np.mean): """Prepares numerical matrix with the gRNA layer data to be used later in the ranking. Parameters ---------- rank_layer_names : list List of layer names to be used in the ranking. method : [type], optional Method to use to combine the layer data, by default np.mean Returns ------- np.ndarray Matrix with the layer data for each gRNA. """ locus_data = [] for g in self.guides: guide_data = [] for layer_name in rank_layer_names: if ( layer_name not in g.layers.keys() and layer_name not in self.layers.keys() ): print(g, layer_name, self.layers.keys(), g.layers.keys()) raise ValueError( f"Layer {layer_name} does not exist on `Locus` or `Guide` object." ) if ( layer_name not in g.layers.keys() and layer_name in self.layers.keys() ): self._apply_clipped_layer_data( [g], layer_name=layer_name, layer_data=self._layers[layer_name] ) layer_data = g._layers[layer_name] if ( isinstance(layer_data, np.ndarray) and layer_data.squeeze().ndim == 1 ): layer_data = method(layer_data) elif isinstance(layer_data, float) or isinstance(layer_data, int): layer_data = float(layer_data) elif isinstance(layer_data, (pd.Series, list)): layer_data = method(np.array(layer_data)) else: raise TypeError("Type of layer data is not valid.") guide_data.append(layer_data) locus_data.append(guide_data) return np.array(locus_data)
[docs] def rank_guides( self, layer_names=None, layer_is_benefit=None, weight_vector=None, ranking_method="TOPSIS", norm_method="Vector", ): """Ranks guides based on the layer data. Returns ------- list List of ranked guides. """ if len(self.guides) == 0: raise ValueError( "No gRNAs to rank. Try running `find_guides()` method first." ) # get layer names registered on individual guides guide_layer_names = sorted( list( set( [ guide_layer_name for g in self.guides for guide_layer_name in g.layers.keys() ] ) ) ) # check if requested layer exists on guides or on locus if layer_names: for layer in guide_layer_names: if layer not in self._layers.keys() and layer not in guide_layer_names: raise ValueError( f"Layer {layer} is not added to `Locus` or `Guide`." ) else: layer_names = list(guide_layer_names) + list(self._layers.keys()) x_matrix = self._prepare_alt_matrix(rank_layer_names=layer_names) # handle edge values x_matrix[np.isnan(x_matrix)] = 0 x_matrix[x_matrix < 0] = 0 rank_scores = mcdm.rank( x_matrix, n_method=norm_method, w_vector=weight_vector, is_benefit_x=layer_is_benefit, s_method=ranking_method, alt_names=[g.id for g in self.guides], ) # order ranks from best to worst rank_asc = np.argsort([-r[1] for r in rank_scores]) for i, (g_id, rank_score) in enumerate(rank_scores): self.guide(g_id).rank_score = rank_score self.guide(g_id).rank = rank_asc[i] + 1 return rank_scores
[docs] def guides_to_dataframe(self): """Returns gRNAs in Pandas dataframe.""" return _guides_to_dataframe(self.guides)
[docs] def guides_to_csv(self, filename): """Save gRNAs in CSV file.""" if filename: return _guides_to_csv(self.guides, filename) else: raise ValueError("Filename required to save CSV with gRNAs.")
# TODO: bed is 0-based, but gRNAs are 1-based
[docs] def guides_to_bed(self, filename): """Save gRNAs in BED file.""" if filename: return _guides_to_bed(self.guides, filename) else: raise ValueError("Filename required to save BED with gRNAs.")
[docs] def guides_detailed_table(self, filename): """Save gRNAs in a detailed text file.""" if filename: return _guides_detailed_table(self.guides, filename) else: raise ValueError("Filename required to save CSV with gRNAs.")
# TODO: plot the locus
[docs] def add_azimuth_score(self): """Apply Azimuth score to a list of guides. Azimuth is a machine learning-based predictive modelling of CRISPR/Cas9 guide efficiency. Sometimes its reffered to as Doench 2016 score. Described in https://doi.org/10.1038/nbt.3437 (Doench et al., 2016) """ return [g.add_azimuth_score() for g in self.guides]
""" Locus creation ------------------ """
[docs] def _prepare_annotation(annotation_file_abspath, as_df=True): """Prepare annotation file for use with pyranges.""" if annotation_file_abspath.suffix in [".gff3", ".gff"]: ann_db = pyranges.read_gff3(str(annotation_file_abspath), as_df=as_df) if as_df: ann_db = ann_db.rename(columns={"Name": "Exon"}) # type: ignore elif annotation_file_abspath.suffix in [".gtf"]: ann_db = pyranges.read_gtf(str(annotation_file_abspath), as_df=as_df) if as_df: ann_db = ann_db.rename(columns={"gene_id": "ID", "exon_number": "Exon"}) # type: ignore else: raise ValueError( "Annotation file not recognised. Annotation file needs to be GFF3 or GTF formnat." ) return ann_db
[docs] def locus_from_coordinates(genome, chromosome, start, end): """Create a locus from coordinates. Coordinates are 1-based. If annotation file is provided, it will be used to annotate the locus. Parameters ---------- genome : Genome Genome object. Can be created using `Genome` class. chromosome : str Chromosome name. start : int Start position. end : int End position. Returns ------- Locus Locus object. """ locus_sequence = Fasta(str(genome.genome_file_abspath)).get_seq( chromosome, start, end ) if genome.annotation_file_abspath and genome.annotation_file_abspath.exists(): ann_db = _prepare_annotation(genome.annotation_file_abspath, as_df=False) locus_annotation = ann_db.intersect( pyranges.PyRanges(chromosomes=[chromosome], starts=[start], ends=[end]) ).df # TODO: rename Name to Exon not necessarily applicable to all annotations if genome.annotation_file_abspath.suffix.lower() in [".gff3", ".gff"]: locus_annotation = locus_annotation.rename(columns={"Name": "Exon"}) # type: ignore elif genome.annotation_file_abspath.suffix.lower() in [".gtf"]: locus_annotation = locus_annotation.rename(columns={"gene_id": "ID", "exon_number": "Exon"}) # type: ignore if len(locus_annotation) > 0: return Locus( genome=genome, sequence=locus_sequence, annotation=locus_annotation ) else: return Locus( genome=genome, sequence=locus_sequence, annotation=None ) # TODO: refactor else: return Locus(genome=genome, sequence=locus_sequence, annotation=None)
[docs] def locus_from_sequence(sequence, sequence_name=None): """Create a locus from sequence. Parameters ---------- sequence : str DNA sequence sequence_name : str, optional Sequence name, by default None Returns ------- Locus Object representing a locus from given sequence. """ return Locus(sequence=sequence, name=sequence_name)
[docs] def locus_from_gene(genome, gene_name): """Create a locus from gene name. If annotation file is provided, it will be used to annotate the locus. Parameters ---------- genome : Genome Genome object. Can be created using `Genome` class. gene_name : str Gene name. Needs to be present in the annotation file. Returns ------- Locus Locus object. """ if genome.annotation_file_abspath.exists(): try: ann_db = _prepare_annotation(genome.annotation_file_abspath, as_df=True) gene_annotation = ann_db.query( "ID == @gene_name & (Feature == 'protein_coding_gene' | Feature == 'gene')" ) chromosome = gene_annotation.Chromosome.values[0] start = int(gene_annotation.Start) + 1 # pyranges is 0-based end = int(gene_annotation.End) locus_annotation = ann_db.query( f"((ID == @gene_name) | (Parent == @gene_name) | (gene_id == @gene_name)) & \ (Chromosome == @chromosome) & \ (((Start >= {start}) & (Start <= {end})) | \ ((End >= {start}) & (End <= {end})))" ) locus_sequence = Fasta( str(genome.genome_file_abspath), one_based_attributes=True ).get_seq(chromosome, start, end) return Locus( genome=genome, sequence=locus_sequence, annotation=locus_annotation ) except Exception as e: raise ValueError(f"Gene {gene_name} not found. (Error: {e})") else: raise ValueError("Annotation file not valid.") # TODO: fix message