# Source code for metage2metabo.m2m.reconstruction

# Copyright (C) 2019-2024 Clémence Frioux & Arnaud Belcour - Inria Dyliss - Pleiade - Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>

import csv
import logging
import os
import re
import statistics
import sys
import time
import xml.etree.ElementTree as etree
from multiprocessing import Pool
from shutil import copyfile

from menetools.sbml import get_model, get_listOfSpecies
from mpwt.mpwt_workflow import multiprocess_pwt
from mpwt.utils import cleaning_input, remove_pgdbs
from padmet.classes.padmetSpec import PadmetSpec
from padmet.utils import sbmlPlugin

from metage2metabo import utils, sbml_management

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Set the level of the "mpwt" logger to INFO.
logging.getLogger("mpwt").setLevel(logging.INFO)


def recon(inp_dir, out_dir, noorphan_bool, padmet_bool, sbml_level, nb_cpu, clean, use_pwt_xml):
    """Run metabolic network reconstruction with Pathway Tools and get SBMLs.

    The program exits with status 1 when ``padmet_bool`` and ``use_pwt_xml``
    are both set, as the two options are incompatible.

    Args:
        inp_dir (str): genomes directory
        out_dir (str): results directory
        noorphan_bool (bool): ignores orphan reactions if True
        padmet_bool (bool): creates padmet files if True
        sbml_level (str): SBML level (2 or 3)
        nb_cpu (int): number of CPU for multiprocessing
        clean (bool): re-run metabolic reconstructions that are already available if found
        use_pwt_xml (bool): use Pathway Tools XML instead of creating them with padmet

    Returns:
        tuple: PGDB directory (str), SBML directory (str), padmet directory
        (str, ``<out_dir>/padmet``, or None when ``use_pwt_xml`` is True)
    """
    starttime = time.time()
    logger.info('\n###############################################')
    logger.info('# #')
    logger.info('# Metabolic network reconstruction #')
    logger.info('# #')
    logger.info('###############################################\n')

    if use_pwt_xml and padmet_bool:
        logger.critical("-p/padmet_bool and --pwt-xml/use_pwt_xml are incompatible arguments")
        sys.exit(1)

    # Create PGDBs
    pgdb_dir = genomes_to_pgdb(inp_dir, out_dir, nb_cpu, clean, use_pwt_xml)

    if use_pwt_xml:
        # The XML files written by Pathway Tools are used directly as SBML.
        sbml_dir = os.path.join(out_dir, 'sbml')
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(sbml_dir, exist_ok=True)
        for xml_file in os.listdir(pgdb_dir):
            input_xml_path = os.path.join(pgdb_dir, xml_file)
            # copyfile() only handles regular files.
            if not os.path.isfile(input_xml_path):
                continue
            # Only swap the extension: str.replace() would also rewrite a
            # '.xml' occurring in the middle of the file name.
            if xml_file.endswith('.xml'):
                sbml_name = xml_file[:-len('.xml')] + '.sbml'
            else:
                sbml_name = xml_file
            copyfile(input_xml_path, os.path.join(sbml_dir, sbml_name))
        padmet_folder = None
    else:
        # Create SBMLs from PGDBs
        sbml_dir = sbml_management.pgdb_to_sbml(pgdb_dir, out_dir, noorphan_bool, padmet_bool, sbml_level, nb_cpu)
        padmet_folder = os.path.join(out_dir, 'padmet')

    output_stat_file = os.path.join(out_dir, 'recon_stats.tsv')
    analyze_recon(sbml_dir, output_stat_file, padmet_folder, padmet_bool, nb_cpu)

    logger.info(
        "--- Recon runtime %.2f seconds ---\n" % (time.time() - starttime))
    return pgdb_dir, sbml_dir, padmet_folder
def genomes_to_pgdb(genomes_dir, output_dir, cpu, clean, use_pwt_xml):
    """Run Pathway Tools on each genome of the repository

    The program exits with status 1 when the input directory, the output
    directory, a required program (pathway-tools, blastp) or the ``~/.ncbirc``
    file is missing, or when the number of results differs from the number of
    genome folders.

    Args:
        genomes_dir (str): genome repository
        output_dir (str): output repository
        cpu (int): number of CPUs to use
        clean (bool): delete PGDBs in ptools-local corresponding to the input data
        use_pwt_xml (bool): use Pathway Tools XML instead of creating them with padmet

    Returns:
        pgdb_dir (str): pgdb repository
    """
    logger.info(
        "######### Running metabolic network reconstruction with Pathway Tools #########"
    )
    if not os.path.isdir(genomes_dir):
        logger.critical("Genomes directory path does not exist.")
        sys.exit(1)

    pgdb_dir = os.path.join(output_dir, 'pgdb')
    log_dir = os.path.join(output_dir, 'pgdb_log')
    ncbirc_path = os.path.join(os.path.expanduser('~'), '.ncbirc')
    log_path = os.path.join(log_dir, 'log_error.txt')

    if not utils.is_valid_dir(pgdb_dir):
        logger.critical('Impossible to access/create output directory')
        sys.exit(1)

    if not utils.check_program('pathway-tools'):
        logger.critical(
            'Pathway Tools is not in the PATH, please fix it before using the program'
        )
        sys.exit(1)

    if not utils.check_program("blastp"):
        logger.critical(
            'blastp is not in the PATH, please fix it before using the program'
        )
        sys.exit(1)

    if not utils.is_valid_file(ncbirc_path):
        logger.critical(
            f'No {ncbirc_path} file, please fix it before using the program'
        )
        sys.exit(1)

    # Only sub-folders are genomes. Plain files such as 'taxon_id.tsv' must
    # not be turned into PGDB names, otherwise the comparison below can never
    # succeed. This matches the folder count done after the run.
    genome_names = [entry for entry in os.listdir(genomes_dir)
                    if os.path.isdir(os.path.join(genomes_dir, entry))]
    genomes_pgdbs = [genome_name.lower() + 'cyc' for genome_name in genome_names]

    if clean:
        remove_pgdbs(to_delete_pgdbs=genomes_pgdbs, number_cpu=cpu)
        cleaning_input(genomes_dir, verbose=False)

    # Check whether PGDBs are already created. If yes and not --clean, pursue without running ptools again
    existing_pgdbs = set()
    for entry in os.listdir(pgdb_dir):
        entry_name = entry
        # With use_pwt_xml the results are '<genome>.xml' files: drop the
        # extension so they can be compared with the genome folder names.
        if use_pwt_xml and entry.endswith('.xml') and os.path.isfile(os.path.join(pgdb_dir, entry)):
            entry_name = entry[:-len('.xml')]
        existing_pgdbs.add(entry_name.lower() + 'cyc')

    # An input without any genome folder has nothing to reuse.
    if genomes_pgdbs and existing_pgdbs == set(genomes_pgdbs):
        logger.warning("PGDBs are already created and will be used. To overrun them, run m2m with --clean option")
        return pgdb_dir

    # True when a taxon file sits at the top level of the genome repository, None otherwise.
    taxon_file = None
    if os.path.isfile(os.path.join(genomes_dir, 'taxon_id.tsv')):
        taxon_file = True

    # Extract either the XML or the dat files, never both.
    move_xml = bool(use_pwt_xml)
    move_dat = not move_xml

    multiprocess_pwt(genomes_dir, pgdb_dir,
                     patho_inference=True,
                     patho_hole_filler=False,
                     patho_operon_predictor=False,
                     no_download_articles=False,
                     flat_creation=True,
                     dat_extraction=move_dat,
                     xml_extraction=move_xml,
                     owl_extraction=False,
                     col_extraction=False,
                     size_reduction=False,
                     number_cpu=cpu,
                     taxon_file=taxon_file,
                     patho_log=log_dir,
                     verbose=False)

    nb_genomes_dir = len(genome_names)
    if use_pwt_xml:
        # One XML file is expected per genome.
        nb_pgdb_dir = len([entry for entry in os.listdir(pgdb_dir)
                           if os.path.isfile(os.path.join(pgdb_dir, entry))])
        logger.warning("Adding prefix M_ to XML form Pathway Tools.")
        for xml_file in os.listdir(pgdb_dir):
            xml_path = os.path.join(pgdb_dir, xml_file)
            # The file is rewritten in place.
            update_pathway_tools_xml(xml_path, xml_path)
    else:
        # One PGDB folder is expected per genome.
        nb_pgdb_dir = len([entry for entry in os.listdir(pgdb_dir)
                           if os.path.isdir(os.path.join(pgdb_dir, entry))])

    if nb_pgdb_dir != nb_genomes_dir:
        if os.path.exists(log_path):
            logger.critical("Something went wrong running Pathway Tools. See the log file in " + log_path)
        else:
            logger.critical("Something went wrong running Pathway Tools.")
        sys.exit(1)

    return (pgdb_dir)
def create_padmet_stat(species_name, padmet_file):
    """Extract reactions/pathways/compounds/genes from a padmet file.

    Args:
        species_name (str): species names
        padmet_file (str): path to a padmet file

    Returns:
        list: [species name, set of genes linked to a reaction, list of
        reactions, list of reactions associated with genes, list of compounds,
        list of pathways]
    """
    padmetSpec = PadmetSpec(padmet_file)

    total_pwy_id = set()
    total_cpd_id = set()
    genes_with_rxns = set()
    gene_associated_rxns = []
    rxns = []

    for rxn_node in padmetSpec.dicOfNode.values():
        if rxn_node.type != "reaction":
            continue
        rxns.append(rxn_node.id)

        # Go through the relations of the reaction once and sort them by type.
        # .get() keeps a reaction without any relation from raising KeyError.
        linked_genes = []
        for rlt in padmetSpec.dicOfRelationIn.get(rxn_node.id, []):
            if rlt.type in ("consumes", "produces"):
                total_cpd_id.add(rlt.id_out)
            elif rlt.type == "is_in_pathway":
                total_pwy_id.add(rlt.id_out)
            elif rlt.type == "is_linked_to":
                linked_genes.append(rlt.id_out)

        if linked_genes:
            genes_with_rxns.update(linked_genes)
            gene_associated_rxns.append(rxn_node.id)

    # Keep only the identifiers that exist as nodes, in node order.
    all_pwys = [node_id for node_id in padmetSpec.dicOfNode if node_id in total_pwy_id]
    all_cpds = [node_id for node_id in padmetSpec.dicOfNode if node_id in total_cpd_id]

    return [species_name, genes_with_rxns, rxns, gene_associated_rxns, all_cpds, all_pwys]
def create_sbml_stat(species_name, sbml_file):
    """Extract reactions/compounds/genes from a sbml file.

    Genes are read from the GENE_ASSOCIATION notes of the reactions. When no
    gene (respectively no gene-associated reaction) is found this way, the
    ones read from the fbc geneProductAssociation elements are used instead.

    Args:
        species_name (str): species names
        sbml_file (str): path to a sbml file

    Returns:
        list: [species name, list of genes, list of reactions, list of
        reactions associated with genes, list of compounds]
    """
    # Attribute carrying the gene identifier in fbc (version 2) elements.
    fbc_gene_product_attr = '{http://www.sbml.org/sbml/level3/version1/fbc/version2}geneProduct'

    sbml = etree.parse(sbml_file).getroot()

    genes = []
    reactions = []
    gene_associated_rxns = []
    fbc_gene_associated_rxns = []
    fbc_rxn_associated_genes = []
    compounds = []
    # Sets mirroring the lists above: constant-time duplicate checks while the
    # lists keep the order of first appearance.
    seen_genes = set()
    seen_gene_rxns = set()
    seen_fbc_genes = set()
    seen_fbc_gene_rxns = set()

    model_element = None
    for child in sbml:
        # Drop the '{namespace}' part ElementTree puts in front of the tag.
        if child.tag.rsplit('}', 1)[-1] == "model":
            model_element = child
    if model_element is None:
        # Without a model there is nothing to count.
        logger.warning("No model element found in " + sbml_file)
        return [species_name, genes, reactions, gene_associated_rxns, compounds]

    for els in model_element:
        if 'listOfSpecies' in els.tag:
            for el in els:
                compounds.append(sbmlPlugin.convert_from_coded_id(el.get('metaid'))[0])
        if 'listOfReactions' in els.tag:
            for el in els:
                reaction_id = sbmlPlugin.convert_from_coded_id(el.get('id'))[0]
                reactions.append(reaction_id)
                for subel in el.iter():
                    if 'notes' in subel.tag:
                        # iter() already yields every descendant of the notes.
                        for note_el in subel.iter():
                            # An element without text has text None.
                            note_text = note_el.text or ''
                            if 'GENE_ASSOCIATION' not in note_text:
                                continue
                            for gene in sbmlPlugin.parseGeneAssoc(note_text):
                                # Normalise before the duplicate check so that
                                # the stored and the compared names agree.
                                gene = gene.replace('GENE_ASSOCIATION:', '')
                                if gene not in seen_genes:
                                    seen_genes.add(gene)
                                    genes.append(gene)
                            if reaction_id not in seen_gene_rxns:
                                seen_gene_rxns.add(reaction_id)
                                gene_associated_rxns.append(reaction_id)
                    # Use geneProductAssociation for xml from MetaFlux.
                    elif 'geneProductAssociation' in subel.tag:
                        for ref_el in subel.iter():
                            gene = ref_el.get(fbc_gene_product_attr)
                            if not gene:
                                continue
                            # Remove the leading 'G_' only, not every 'G_'
                            # found inside the identifier.
                            if gene.startswith('G_'):
                                gene = gene[len('G_'):]
                            if gene not in seen_fbc_genes:
                                seen_fbc_genes.add(gene)
                                fbc_rxn_associated_genes.append(gene)
                            if reaction_id not in seen_fbc_gene_rxns:
                                seen_fbc_gene_rxns.add(reaction_id)
                                fbc_gene_associated_rxns.append(reaction_id)

    # For XML from MetaFlux, use genes from geneProductAssociation to get genes and reaction with genes.
    if not genes and fbc_rxn_associated_genes:
        genes = fbc_rxn_associated_genes
    if not gene_associated_rxns and fbc_gene_associated_rxns:
        gene_associated_rxns = fbc_gene_associated_rxns

    return [species_name, genes, reactions, gene_associated_rxns, compounds]
def mean_sd_data(datas):
    """Compute the mean and standard deviation from a list.

    Both values are returned as text formatted with two decimals, ready to be
    concatenated in a log message.

    Args:
        datas (list): list of integer/float

    Returns:
        mean_data (str): mean of the list, e.g. "2.00" (None with fewer than two values)
        sd_data (str): standard deviation of the list, e.g. "(+/- 1.00)" (None with fewer than two values)
    """
    # A standard deviation needs at least two values.
    if len(datas) <= 1:
        logger.info("No mean and standard deviation on one sample.")
        return None, None

    average = statistics.mean(datas)
    deviation = statistics.stdev(datas)
    return "{0:.2f}".format(average), "(+/- {0:.2f})".format(deviation)
def update_pathway_tools_xml(input_sbml, output_sbml):
    """Update XML from Pathway Tools by adding a 'M_' prefix to avoid issue, when using metabolite IDs.

    Every occurrence of a species ID in the file is prefixed, wherever it
    appears. Each occurrence is prefixed exactly once, even when a species ID
    is contained in another one (for example 'ADP_c' in 'NADP_c').

    Args:
        input_sbml (str): path to xml input file
        output_sbml (str): path to xml output file
    """
    tree = etree.parse(input_sbml)
    sbml = tree.getroot()
    model = get_model(sbml)

    # Unique, non-empty IDs: a duplicate would be prefixed twice and an empty
    # ID would match everywhere. The longest IDs come first so that an ID
    # contained in a longer one does not take over the longer one's occurrences.
    unique_ids = {species.attrib.get("id") for species in get_listOfSpecies(model)}
    speciesids = sorted((species_id for species_id in unique_ids if species_id),
                        key=lambda species_id: (-len(species_id), species_id))

    with open(input_sbml, 'r') as open_sbml:
        open_sbml_str = open_sbml.read()

    # Step 1: swap every ID for a placeholder made only of control characters.
    # Those characters are not allowed in XML, so no species ID can match
    # inside a placeholder and an ID that was already handled is never matched
    # again by a shorter one. The index of the ID is written in binary with
    # \x01 for 0 and \x02 for 1, between two \x00.
    to_marker = str.maketrans('01', '\x01\x02')
    from_marker = str.maketrans('\x01\x02', '01')
    for index, species in enumerate(speciesids):
        placeholder = '\x00' + format(index, 'b').translate(to_marker) + '\x00'
        open_sbml_str = open_sbml_str.replace(species, placeholder)

    # Step 2: turn each placeholder into the prefixed ID in a single pass.
    def restore_prefixed_id(match):
        index = int(match.group(1).translate(from_marker), 2)
        return 'M_' + speciesids[index]

    open_sbml_str = re.sub('\x00([\x01\x02]+)\x00', restore_prefixed_id, open_sbml_str)

    with open(output_sbml, 'w') as open_sbml_out:
        open_sbml_out.write(open_sbml_str)
def _log_gsmn_stat(average_label, single_label, values):
    """Log the mean (+/- sd) of values, or the value itself when there is only one."""
    if len(values) > 1:
        mean_value, sd_value = mean_sd_data(values)
        if mean_value and sd_value:
            logger.info(average_label + mean_value + sd_value)
    else:
        logger.info(single_label + str(values[0]))


def analyze_recon(sbml_folder, output_stat_file, padmet_folder=None, padmet_bool=None, nb_cpu=1):
    """Analyze the sbml and/or the padmet files after metabolic network reconstruction.
    And write the result in a file.

    The padmet files are used when both padmet_bool and padmet_folder are
    set, the sbml files otherwise. One row per file is written in the
    tab-separated output_stat_file and a summary is logged. The program exits
    with status 1 when the folder to analyze is empty.

    Args:
        sbml_folder (str): directory of SBML files
        output_stat_file (str): path to output stat file
        padmet_folder (str): directory of PADMET files
        padmet_bool (bool): use or not the padmet files
        nb_cpu (int): number of CPU to use
    """
    use_padmet = bool(padmet_bool and padmet_folder)
    if use_padmet:
        input_folder = padmet_folder
        extension = '.padmet'
        stat_function = create_padmet_stat
        empty_message = "No padmet in " + padmet_folder
        header = ['species', 'nb_reactions', 'nb_reactions_with_genes', 'nb_genes', 'nb_compounds', 'nb_pathways']
    else:
        input_folder = sbml_folder
        extension = '.sbml'
        stat_function = create_sbml_stat
        empty_message = "No sbml in " + sbml_folder
        header = ['species', 'nb_reactions', 'nb_reactions_with_genes', 'nb_genes', 'nb_compounds']

    # Check the input before starting any worker process. The list itself is
    # tested: comparing it with 0 is always False.
    input_files = os.listdir(input_folder)
    if not input_files:
        logger.critical(empty_message)
        sys.exit(1)

    multiprocessing_data = [(input_file.replace(extension, ''), os.path.join(input_folder, input_file))
                            for input_file in input_files]

    analyze_pool = Pool(processes=nb_cpu)
    try:
        recon_stats = analyze_pool.starmap(stat_function, multiprocessing_data)
    finally:
        # Release the workers even when the analysis of a file fails.
        analyze_pool.close()
        analyze_pool.join()

    genes = {}
    reactions = {}
    gene_associated_reactions = {}
    compounds = {}
    # Pathways are only available in padmet files.
    pathways = {} if use_padmet else None

    # newline='' lets the csv module control the line endings.
    with open(output_stat_file, 'w', newline='') as micro_file:
        csvwriter = csv.writer(micro_file, delimiter='\t')
        csvwriter.writerow(header)
        for recon_stat in recon_stats:
            species_name = recon_stat[0]
            if use_padmet:
                genes[species_name] = recon_stat[1]
                reactions[species_name] = recon_stat[2]
                gene_associated_reactions[species_name] = recon_stat[3]
                compounds[species_name] = recon_stat[4]
                pathways[species_name] = recon_stat[5]
            else:
                # The sbml statistics are lists: count each identifier once.
                genes[species_name] = set(recon_stat[1])
                reactions[species_name] = set(recon_stat[2])
                gene_associated_reactions[species_name] = set(recon_stat[3])
                compounds[species_name] = set(recon_stat[4])

            row = [species_name,
                   len(reactions[species_name]),
                   len(gene_associated_reactions[species_name]),
                   len(genes[species_name]),
                   len(compounds[species_name])]
            if use_padmet:
                row.append(len(pathways[species_name]))
            csvwriter.writerow(row)

    logger.info("######### Stats GSMN reconstruction #########")

    if len(genes) == len(reactions) and len(genes) == len(compounds) and len(reactions) == len(compounds):
        logger.info("Number of genomes: " + str(len(genes)))

    dataset_all_reactions = {reaction for species_name in reactions for reaction in reactions[species_name]}
    logger.info("Number of reactions in all GSMN: " + str(len(dataset_all_reactions)))

    dataset_all_compounds = {compound for species_name in compounds for compound in compounds[species_name]}
    logger.info("Number of compounds in all GSMN: " + str(len(dataset_all_compounds)))

    _log_gsmn_stat("Average reactions per GSMN: ", "Number of reactions in GSMN: ",
                   [len(reactions[species_name]) for species_name in reactions])
    _log_gsmn_stat("Average compounds per GSMN: ", "Number of compounds in GSMN: ",
                   [len(compounds[species_name]) for species_name in compounds])
    _log_gsmn_stat("Average genes per GSMN: ", "Number of genes in GSMN: ",
                   [len(genes[species_name]) for species_name in genes])
    if pathways:
        _log_gsmn_stat("Average pathways per GSMN: ", "Number of pathways in GSMN: ",
                       [len(pathways[species_name]) for species_name in pathways])

    gene_reactions_assoc_percentages = []
    for species_name in reactions:
        if len(reactions[species_name]) > 0:
            gene_reactions_assoc_percentages.append(
                (len(gene_associated_reactions[species_name]) / len(reactions[species_name])) * 100)
        else:
            # Avoid a division by zero for an empty network.
            gene_reactions_assoc_percentages.append(0)
            logger.info('Warning: ' + species_name + ' metabolic network contains 0 reactions.')
    _log_gsmn_stat('Percentage of reactions associated with genes: ',
                   'Percentage of reactions associated with genes: ',
                   gene_reactions_assoc_percentages)