Source code for metage2metabo.m2m_analysis.solution_graph

# Copyright (C) 2019-2024 Clémence Frioux & Arnaud Belcour - Inria Dyliss - Pleiade - Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>

import csv
import json
import logging
import networkx as nx
import os
import sys
import time

from itertools import combinations
from metage2metabo import utils, sbml_management
from metage2metabo.m2m_analysis.taxonomy import get_taxon, extract_taxa
from operator import add

logger = logging.getLogger(__name__)


def graph_analysis(json_file_folder, target_folder_file, output_dir, taxon_file=None, taxonomy_level="phylum"):
    """Run the graph creation on miscoto output

    Args:
        json_file_folder (str): json file or folder containing multiple jsons
        target_folder_file (str): targets file or folder containing multiple sbmls
        output_dir (str): results directory
        taxon_file (str): mpwt taxon file for species in sbml folder
        taxonomy_level (str): taxonomy level, must be: phylum, class, order, family, genus or species.

    Returns:
        str: path to folder containing gml results
    """
    starttime = time.time()
    logger.info('\n###############################################')
    logger.info('#                                             #')
    logger.info('#           Solution graph creation           #')
    logger.info('#                                             #')
    logger.info('###############################################\n')

    target_paths = utils.file_or_folder(target_folder_file)
    json_paths = utils.file_or_folder(json_file_folder)

    gml_output = os.path.join(output_dir, 'gml')

    if taxon_file:
        taxonomy_output_file = os.path.join(output_dir, 'taxonomy_species.tsv')
        tree_output_file = os.path.join(output_dir, 'taxon_tree.txt')
        extract_taxa(taxon_file, taxonomy_output_file, tree_output_file, taxonomy_level)
    else:
        taxonomy_output_file = None

    create_gml(json_paths, target_paths, output_dir, taxonomy_output_file)

    logger.info(
        "--- Graph runtime %.2f seconds ---\n" % (time.time() - starttime))

    return gml_output
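
# Illustrative usage sketch (not part of the original module): graph_analysis expects
# a miscoto enumeration json (or a folder of jsons), the corresponding targets SBML
# file (or folder), and an output directory; the paths below are hypothetical.
#
#   gml_dir = graph_analysis('m2m_out/json', 'm2m_out/targets', 'm2m_analysis_out',
#                            taxon_file='taxon_id.tsv', taxonomy_level='phylum')
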
def create_gml(json_paths, target_paths, output_dir, taxon_file=None):
    """Create solution graph from miscoto output and compute stats

    Args:
        json_paths (dict): {target: path_to_corresponding_json}
        target_paths (dict): {target: path_to_corresponding_sbml}
        output_dir (str): results directory
        taxon_file (str): mpwt taxon file for species in sbml folder
    """
    miscoto_stat_output = os.path.join(output_dir, 'miscoto_stats.txt')
    key_species_stats_output = os.path.join(output_dir, 'key_species_stats.tsv')
    key_species_json = os.path.join(output_dir, 'key_species.json')

    gml_output = os.path.join(output_dir, 'gml')
    if not utils.is_valid_dir(gml_output):
        logger.critical('Impossible to access/create output directory')
        sys.exit(1)

    len_min_sol = {}
    len_union = {}
    len_intersection = {}
    len_solution = {}
    len_target = {}

    target_categories = {}
    for target in target_paths:
        target_categories[target] = sbml_management.get_compounds(target_paths[target])

    if taxon_file:
        taxon_named_species, all_taxons = get_taxon(taxon_file)
    else:
        taxon_named_species = None
        all_taxons = None

    key_species_data = {}
    miscoto_stat_output_datas = []

    for target_category in target_categories:
        key_species_data[target_category] = {}
        key_species_data[target_category]['essential_symbionts'] = {}
        key_species_data[target_category]['alternative_symbionts'] = {}

        target_output_gml_path = os.path.join(gml_output, target_category + '.gml')
        with open(json_paths[target_category]) as json_data:
            dicti = json.load(json_data)

        G = nx.Graph()
        added_node = []
        species_weight = {}

        if dicti['still_unprod'] != []:
            logger.error('ERROR: {0} is unproducible'.format(','.join(dicti['still_unprod'])))
            logger.error('ERROR: Please remove these unproducible targets ({0}) from the targets file, delete the output folder ({1}) and re-run m2m_analysis'.format(','.join(dicti['still_unprod']), output_dir))
            sys.exit(1)

        len_target[target_category] = len(dicti['newly_prod']) + len(dicti['still_unprod'])
        len_min_sol[target_category] = len(dicti['bacteria'])
        len_union[target_category] = len(dicti['union_bacteria'])
        len_intersection[target_category] = len(dicti['inter_bacteria'])

        # Essential symbionts (ES) occur in every enumerated solution (intersection),
        # alternative symbionts (AS) only in some of them.
        key_species_types = {organism: 'ES' if organism in dicti['inter_bacteria'] else 'AS'
                             for organism in dicti['union_bacteria']}

        if taxon_file:
            for taxon in all_taxons:
                key_species_data[target_category]['essential_symbionts'][taxon] = [
                    organism for organism in key_species_types
                    if key_species_types[organism] == 'ES' and taxon_named_species[organism].split('__')[0] == taxon]
                key_species_data[target_category]['alternative_symbionts'][taxon] = [
                    organism for organism in key_species_types
                    if key_species_types[organism] == 'AS' and taxon_named_species[organism].split('__')[0] == taxon]
        else:
            key_species_data[target_category]['essential_symbionts']['data'] = [
                organism for organism in key_species_types if key_species_types[organism] == 'ES']
            key_species_data[target_category]['alternative_symbionts']['data'] = [
                organism for organism in key_species_types if key_species_types[organism] == 'AS']

        len_solution[target_category] = len(dicti['enum_bacteria'])

        for sol in dicti['enum_bacteria']:
            if len(dicti['enum_bacteria'][sol]) > 1:
                for species_1, species_2 in combinations(dicti['enum_bacteria'][sol], 2):
                    if species_1 not in added_node:
                        if taxon_file:
                            G.add_node(taxon_named_species[species_1], note=key_species_types[species_1])
                        else:
                            G.add_node(species_1, note=key_species_types[species_1])
                        added_node.append(species_1)
                    if species_2 not in added_node:
                        if taxon_file:
                            G.add_node(taxon_named_species[species_2], note=key_species_types[species_2])
                        else:
                            G.add_node(species_2, note=key_species_types[species_2])
                        added_node.append(species_2)
                    combination_species = '_'.join(sorted([species_1, species_2]))
                    if combination_species not in species_weight:
                        species_weight[combination_species] = 1
                    else:
                        species_weight[combination_species] += 1
                    if taxon_file:
                        G.add_edge(taxon_named_species[species_1], taxon_named_species[species_2],
                                   weight=species_weight[combination_species])
                    else:
                        G.add_edge(species_1, species_2, weight=species_weight[combination_species])
            elif len(dicti['enum_bacteria'][sol]) == 1:
                species_1 = dicti['enum_bacteria'][sol][0]
                if species_1 not in added_node:
                    if taxon_file:
                        G.add_node(taxon_named_species[species_1], note=key_species_types[species_1])
                    else:
                        G.add_node(species_1, note=key_species_types[species_1])
                    added_node.append(species_1)

        # Stop if all the nodes of G are isolated.
        if len(G.nodes) == nx.number_of_isolates(G):
            logger.critical(r'/!\ Warning: all the nodes of the solution graph are isolated (they are not connected to other nodes). This leads to powergrasp creating an empty powergraph.')
            logger.critical('So m2m_analysis stops at the solution graph step.')
            sys.exit(1)

        miscoto_stat_output_datas.append([target_category, str(len_target[target_category]),
                                          str(len_min_sol[target_category]),
                                          str(len_union[target_category]),
                                          str(len_intersection[target_category]),
                                          str(len_solution[target_category])])

        logger.info('######### Graph of ' + target_category + ' #########')
        logger.info('Number of nodes: ' + str(G.number_of_nodes()))
        logger.info('Number of edges: ' + str(G.number_of_edges()))
        nx.write_gml(G, target_output_gml_path)

    with open(miscoto_stat_output, 'w') as stats_output:
        statswriter = csv.writer(stats_output, delimiter="\t")
        statswriter.writerow(['categories', 'nb_target', 'size_min_sol', 'size_union', 'size_intersection', 'size_enum'])
        for miscoto_stat_output_data in miscoto_stat_output_datas:
            statswriter.writerow(miscoto_stat_output_data)

    with open(key_species_json, 'w') as json_output:
        json.dump(key_species_data, json_output, indent=4)

    with open(key_species_stats_output, 'w') as key_stat_output:
        key_stats_writer = csv.writer(key_stat_output, delimiter='\t')
        if all_taxons:
            key_stats_writer.writerow(['target_categories', 'key_group', *sorted(all_taxons), 'Sum'])
        else:
            key_stats_writer.writerow(['target_categories', 'key_group', 'data', 'Sum'])
        for target in key_species_data:
            if all_taxons:
                essential_counts = [len(key_species_data[target]['essential_symbionts'][taxon]) for taxon in sorted(all_taxons)]
                alternative_counts = [len(key_species_data[target]['alternative_symbionts'][taxon]) for taxon in sorted(all_taxons)]
            else:
                essential_counts = [len(key_species_data[target]['essential_symbionts']['data'])]
                alternative_counts = [len(key_species_data[target]['alternative_symbionts']['data'])]
            key_counts = list(map(add, essential_counts, alternative_counts))
            key_stats_writer.writerow([target, 'key_species', *key_counts, sum(key_counts)])
            key_stats_writer.writerow([target, 'essential_symbionts', *essential_counts, sum(essential_counts)])
            key_stats_writer.writerow([target, 'alternative_symbionts', *alternative_counts, sum(alternative_counts)])
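
# Illustrative usage sketch (not part of the original module): create_gml takes
# dictionaries mapping each target-set name to its miscoto json and to its targets
# SBML file, as built by utils.file_or_folder() in graph_analysis above; the names
# and paths below are hypothetical.
#
#   create_gml({'targets_A': 'm2m_out/json/targets_A.json'},
#              {'targets_A': 'm2m_out/targets/targets_A.sbml'},
#              'm2m_analysis_out',
#              taxon_file=None)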