Source code for metage2metabo.m2m.individual_scope

# Copyright (C) 2019-2024 Clémence Frioux & Arnaud Belcour - Inria Dyliss - Pleiade - Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>

import csv
import json
import logging
import os
import statistics
import sys
import time
import traceback
import xml.etree.ElementTree as etree

from menetools import run_menescope
from menetools.sbml import readSBMLspecies_clyngor

from metage2metabo import utils

from multiprocessing import Pool

logger = logging.getLogger(__name__)
logging.getLogger("menetools").setLevel(logging.CRITICAL)


[docs] def iscope(sbmldir, seeds, out_dir, cpu_number=1): """Compute individual scopes (reachable metabolites) for SBML files in a directory. Args: sbmldir (str): SBML files directory seeds (str): SBML seeds file out_dir (str): output directory cpu_number (int): number of CPU to use for multiprocessing Returns: set: union of reachable metabolites for all metabolic networks """ # Run individual scopes of metabolic networks if any starttime = time.time() if len([ name for name in os.listdir(sbmldir) if os.path.isfile(os.path.join(sbmldir, name)) and utils.get_extension(os.path.join(sbmldir, name)).lower() in ["xml", "sbml"] ]) > 1: scope_dict, seeds_dict, scope_json, seeds_status_path = indiv_scope_run(sbmldir, seeds, out_dir, cpu_number) logger.info(f'\nIndividual scopes for all metabolic networks available in {scope_json}. The scopes have been filtered a way that if a seed is in a scope, it means the corresponding species is predicted to be able to produce it.') logger.info(f'\nInformation regarding the producibility of seeds, and the possible absence of seeds in some metabolic networks is stored in {seeds_status_path}.\n') # Analyze the individual scopes results (json file) reachable_metabolites_union = analyze_indiv_scope(scope_dict, seeds_dict, seeds) # Compute the reverse iscopes (who produces each metabolite) reverse_scope_json, reverse_scope_tsv = reverse_scope(scope_dict, out_dir) logger.info(f"\nAnalysis of functional redundancy (producers of all metabolites) is computed as a dictionary in {reverse_scope_json} and as a matrix in {reverse_scope_tsv}.") logger.info("--- Indiv scopes runtime %.2f seconds ---\n" % (time.time() - starttime)) return reachable_metabolites_union else: logger.critical("The number of SBML files is <= 1 in the input directory") sys.exit(1)
[docs] def indiv_scope_run(sbml_dir, seeds, output_dir, cpu_number=1): """Run Menetools and analyse individual metabolic capabilities. Args: sbml_dir (str): directory of SBML files seeds (str): SBML seeds file output_dir (str): directory for results cpu_number (int): number of CPU to use for multiprocessing Returns: str: path to output file for scope from Menetools analysis """ logger.info('\n###############################################') logger.info('# #') logger.info('# Individual metabolic potentials #') logger.info('# #') logger.info('###############################################\n') menetools_dir = os.path.join(output_dir, 'indiv_scopes') indiv_scopes_path = os.path.join(menetools_dir, 'indiv_scopes.json') produced_seeds_path = os.path.join(menetools_dir, 'seeds_in_indiv_scopes.json') if not utils.is_valid_dir(menetools_dir): logger.critical('Impossible to access/create output directory') sys.exit(1) all_files = [ f for f in os.listdir(sbml_dir) if os.path.isfile(os.path.join(sbml_dir, f)) and utils.get_extension( os.path.join(sbml_dir, f)).lower() in ['xml', 'sbml'] ] all_scopes = {} all_produced_seeds = {} all_absent_seeds = {} all_non_produced_seeds = {} multiprocessing_indiv_scopes = [] for f in all_files: bname = utils.get_basename(f) sbml_path = os.path.join(sbml_dir, f) multiprocessing_indiv_scopes.append((sbml_path, bname, seeds)) menescope_pool = Pool(cpu_number) results = menescope_pool.starmap(indiv_scope_on_species, multiprocessing_indiv_scopes) for result in results: error = result[0] if error is True: logger.critical('------------An error occurred during M2M run of Menetools, M2M will stop-------------') menescope_pool.close() menescope_pool.join() sys.exit(1) bname = result[1] menescope_results = result[2] all_scopes[bname] = menescope_results['scope'] all_produced_seeds[bname] = menescope_results['produced_seeds'] all_absent_seeds[bname] = menescope_results['absent_seeds'] all_non_produced_seeds[bname] = menescope_results['non_produced_seeds'] menescope_pool.close() menescope_pool.join() seeds_status = {} seeds_status['individually_producible_seeds'] = all_produced_seeds seeds_status['seeds_absent_in_metabolic_network'] = all_absent_seeds seeds_status['individually_non_producible_seeds'] = all_non_produced_seeds # Some seeds might be producible by some metabolic networks. By default, menescope include all seeds in the scope, regardless of their real producibility by the network. seed_status_dict holds this information. # We'll remove them here # remove from each scope the seeds that are not producible by the corresponding species. for species in all_scopes: # non producible seeds are in seeds_status_dict['individually_non_producible_seeds'][species] all_scopes[species] = list(set(all_scopes[species]) - set(seeds_status['individually_non_producible_seeds'][species])) with open(indiv_scopes_path, 'w') as dumpfile: json.dump(all_scopes, dumpfile, indent=4) with open(produced_seeds_path, 'w') as dumpfile: json.dump(seeds_status, dumpfile, indent=4, sort_keys=True) return all_scopes, seeds_status, indiv_scopes_path, produced_seeds_path
[docs] def indiv_scope_on_species(sbml_path, bname, seeds_path): """Run Menetools and analyse individual metabolic capabilities on a sbml. Args: sbml_path (str): path to SBML file bname (str): name linked to SBML file seeds_path (str): path to SBML seeds file Returns: list: [boolean error, bname, dictionary containing menescope results] """ error = False try: menescope_results = run_menescope( draft_sbml=sbml_path, seeds_sbml=seeds_path) except: traceback_str = traceback.format_exc() # Don't print the traceback if the error is linked to SystemExit as the error has been hanled by menetools. if 'SystemExit: 1' not in traceback_str: logger.critical(traceback_str) logger.critical('---------------Something went wrong running Menetools on ' + bname + '---------------') error = True menescope_results = None return [error, bname, menescope_results]
[docs] def analyze_indiv_scope(scope_dict, seeds_status_dict, seeds): """Analyze the output of Menescope, stored in two dictionaries Args: scope_dict (dict): output of all menescope runs seeds_status_dict (dict): production status of seeds in all menescope runs seeds (str): SBML seeds file Returns: set: union of all the individual scopes """ scope_dict_set = {} individually_producible_seeds_set = {} for elem in scope_dict: scope_dict_set[elem] = set(scope_dict[elem]) for elem in seeds_status_dict['individually_producible_seeds']: individually_producible_seeds_set[elem] = set(seeds_status_dict['individually_producible_seeds'][elem]) try: seed_metabolites = readSBMLspecies_clyngor(seeds, 'seeds') except FileNotFoundError: logger.critical('File not found: '+ seeds) sys.exit(1) except etree.ParseError: logger.critical(f'Invalid syntax in SBML file: {seeds}') sys.exit(1) except: traceback_str = traceback.format_exc() # Don't print the traceback if the error is linked to SystemExit as the error has been handled by menetools. if 'SystemExit: 1' not in traceback_str: logger.critical(traceback_str) logger.critical('---------------Something went wrong running Menetools on " + seeds + "---------------') sys.exit(1) logger.info('%i metabolic models considered.' %(len(scope_dict_set))) intersection_scope = set.intersection(*list(scope_dict_set.values())) logger.info('\n' + str(len(intersection_scope)) + ' metabolites in core reachable by all organisms (intersection) \n') logger.info("\n".join(intersection_scope)) union_scope = set.union(*list(scope_dict_set.values())) union_producible_seeds = set.union(*list(individually_producible_seeds_set.values())) logger.info('\n' + str(len(union_scope)) + ' metabolites reachable by individual organisms altogether (union), among which ' + str(len(union_producible_seeds)) + ' metabolites that are also part of the seeds (growth medium) \n') logger.info("\n".join(union_scope)) len_scope = [len(scope_dict[elem]) for elem in scope_dict] logger.info('\nSummary:') logger.info('- intersection of scopes ' + str(len(intersection_scope))) logger.info('- union of scopes ' + str(len(union_scope))) logger.info('- max metabolites in scopes ' + str(max(len_scope))) logger.info('- min metabolites in scopes ' + str(min(len_scope))) logger.info('- average number of metabolites in scopes %.2f (+/- %.2f)' % (statistics.mean(len_scope), statistics.stdev(len_scope))) return union_scope
[docs] def reverse_scope(scope_dict, output_dir): """Reverse a scope dictionary by focusing on metabolite producers. Args: scope_dict (dict): dict of scope output_dir (str): path to output directory Returns: (str, str): paths to the JSON and TSV outputs """ rev_indiv_scopes_json_path = os.path.join(*[output_dir, 'indiv_scopes', 'rev_iscope.json']) rev_indiv_scopes_tsv_path = os.path.join(*[output_dir, 'indiv_scopes', 'rev_iscope.tsv']) new_dic = {} for k,v in scope_dict.items(): for x in v: new_dic.setdefault(x,[]).append(k) with open(rev_indiv_scopes_json_path, 'w') as g: json.dump(new_dic, g, indent=True, sort_keys=True) all_compounds = [compound for compound in new_dic] all_species = [species for species in scope_dict] # For each species get the possibility of production of each compounds. with open(rev_indiv_scopes_tsv_path, 'w') as output_file: csvwriter = csv.writer(output_file, delimiter='\t') csvwriter.writerow(['', *all_compounds]) for species in all_species: csvwriter.writerow([species, *[1 if species in new_dic[compound] else 0 for compound in all_compounds]]) return rev_indiv_scopes_json_path, rev_indiv_scopes_tsv_path