Source code for metage2metabo.m2m_analysis.taxonomy

# Copyright (C) 2019-2024 Clémence Frioux & Arnaud Belcour - Inria Dyliss - Pleiade - Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>

import csv
import time
import sys
import logging
from ete3 import NCBITaxa

logger = logging.getLogger(__name__)


[docs] def get_taxon(taxonomy_file_path): """From the taxonomy file (created by extract_taxa) create a dictionary and a list linking taxon and species Args: taxonomy_file_path (str): path to the taxonomy_file Returns: taxon_named_species (dict): associate organism ID as key with taxon name as value all_taxa (list): list all taxa in file """ taxon_named_species = {} all_taxa = [] with open(taxonomy_file_path, "r") as taxonomy_file: taxonomy_reader = csv.reader(taxonomy_file, delimiter="\t", quotechar="|") next(taxonomy_reader) for row in taxonomy_reader: taxonomy = row[2].split('__')[0] taxon_named_species[row[0]] = row[2] if taxonomy not in all_taxa: all_taxa.append(taxonomy) return taxon_named_species, all_taxa
[docs] def extract_taxa(mpwt_taxon_file, taxon_output_file, tree_output_file, taxonomy_level="phylum"): """From NCBI taxon ID, extract taxonomy rank and create a tree file Args: mpwt_taxon_file (str): mpwt taxon file for species in sbml folder taxon_output_file (str): path to taxonomy output file tree_output_file (str): path to tree output file taxonomy_level (str): taxonomy level, must be: phylum, class, order, family, genus or species. """ starttime = time.time() logger.info('######### Extract taxon information from {0}. #########'.format(mpwt_taxon_file)) ncbi = NCBITaxa() # Map the taxonomy level to the taxonomy index in the ranks list. map_taxonomy_index = {'phylum': 0, 'class': 1, 'order': 2, 'family': 3, 'genus': 4, 'species': 5} taxonomy_index = map_taxonomy_index[taxonomy_level] taxon_ids = [] taxon_count = {} taxonomy_file_datas = [] with open(mpwt_taxon_file, "r") as taxon_file: dict_reader = csv.DictReader(taxon_file, delimiter="\t") headers = set(dict_reader.fieldnames) if set(['taxon_id', 'species']).issubset(headers): for line in dict_reader: if line['taxon_id'] is not None and line['taxon_id'] != '': taxon_ids.append(line['taxon_id']) lineage = ncbi.get_lineage(line['taxon_id']) lineage2ranks = ncbi.get_rank(lineage) names = ncbi.get_taxid_translator(lineage) ranks2lineage = dict((rank, names[taxid]) for (taxid, rank) in lineage2ranks.items()) ranks = [ranks2lineage.get(rank, "unknown") for rank in ["phylum", "class", "order", "family", "genus", "species"]] if ranks[taxonomy_index] != "unknown": taxon = ranks[taxonomy_index] else: taxon = "unknown" taxon = taxon.replace(' ', '_').replace('.', '') if taxon not in taxon_count: taxon_count[taxon] = 1 else: taxon_count[taxon] += 1 row = ([line['species'], line['taxon_id']] + [taxon + '__' + str(taxon_count[taxon])] + ranks) taxonomy_file_datas.append(row) else: logger.warning('No taxon_id for {0} in file {1}.'.format(line['species'], mpwt_taxon_file)) else: logger.critical('ERROR: No headers "taxon_id" and/or "species" in taxon file {0}.'.format(mpwt_taxon_file)) sys.exit() with open(taxon_output_file, "w") as taxonomy_file: csvwriter = csv.writer(taxonomy_file, delimiter="\t") csvwriter.writerow(["organism_id", "taxid", "taxon_number", "phylum", "class", "order", "family", "genus", "species"]) for taxonomy_file_data in taxonomy_file_datas: csvwriter.writerow(taxonomy_file_data) tree = ncbi.get_topology(taxon_ids) with open(tree_output_file, "w") as tree_file: tree_file.write(tree.get_ascii(attributes=["sci_name", "rank"])) logger.info( "--- Taxonomy runtime %.2f seconds ---\n" % (time.time() - starttime))