Source code for metage2metabo.m2m_analysis.solution_graph

# Copyright (C) 2019-2024 Clémence Frioux & Arnaud Belcour - Inria Dyliss - Pleiade - Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>

import csv
import json
import logging
import networkx as nx
import os
import sys
import time

from itertools import combinations
from metage2metabo import utils, sbml_management
from metage2metabo.m2m_analysis.taxonomy import get_taxon, extract_taxa
from operator import add

logger = logging.getLogger(__name__)



[docs]
def graph_analysis(json_file_folder, target_folder_file, output_dir, taxon_file=None, taxonomy_level="phylum"):
    """Run the graph creation on miscoto output

    Args:
        json_file_folder (str): json file or folder containing multiple jsons
        target_folder_file (str): targets file or folder containing multiple sbmls
        output_dir (str): results directory
        taxon_file (str): mpwt taxon file for species in sbml folder
        taxonomy_level (str): taxonomy level, must be: phylum, class, order, family, genus or species.

    Returns:
        str: path to folder containing gml results
    """
    starttime = time.time()
    logger.info('\n###############################################')
    logger.info('#                                             #')
    logger.info('#         Solution graph creation             #')
    logger.info('#                                             #')
    logger.info('###############################################\n')

    target_paths = utils.file_or_folder(target_folder_file)
    json_paths = utils.file_or_folder(json_file_folder)

    gml_output = os.path.join(output_dir, 'gml')

    if taxon_file:
        taxonomy_output_file = os.path.join(output_dir, 'taxonomy_species.tsv')
        tree_output_file = os.path.join(output_dir, 'taxon_tree.txt')
        extract_taxa(taxon_file, taxonomy_output_file, tree_output_file, taxonomy_level)
    else:
        taxonomy_output_file = None

    create_gml(json_paths, target_paths, output_dir, taxonomy_output_file)

    logger.info(
        "--- Graph runtime %.2f seconds ---\n" % (time.time() - starttime))

    return gml_output




[docs]
def create_gml(json_paths, target_paths, output_dir, taxon_file=None):
    """Create solution graph from miscoto output and compute stats

    Args:
        json_paths (str): {target: path_to_corresponding_json}
        target_paths (str): {target: path_to_corresponding_sbml}
        output_dir (str): results directory
        taxon_file (str): mpwt taxon file for species in sbml folder
    """
    miscoto_stat_output = os.path.join(output_dir, 'miscoto_stats.txt')
    key_species_stats_output = os.path.join(output_dir,'key_species_stats.tsv')
    key_species_json = os.path.join(output_dir, 'key_species.json')

    gml_output = os.path.join(output_dir, 'gml')

    if not utils.is_valid_dir(gml_output):
        logger.critical('Impossible to access/create output directory')
        sys.exit(1)

    len_min_sol = {}
    len_union = {}
    len_intersection = {}
    len_solution = {}
    len_target = {}

    target_categories = {}
    for target in target_paths:
        target_categories[target] = sbml_management.get_compounds(target_paths[target])

    if taxon_file:
        taxon_named_species, all_taxons = get_taxon(taxon_file)
    else:
        taxon_named_species = None
        all_taxons = None

    key_species_data = {}
    miscoto_stat_output_datas = []

    for target_category in target_categories:
        key_species_data[target_category] = {}
        key_species_data[target_category]['essential_symbionts'] = {}
        key_species_data[target_category]['alternative_symbionts'] = {}
        target_output_gml_path = os.path.join(gml_output, target_category + '.gml')
        with open(json_paths[target_category]) as json_data:
            dicti = json.load(json_data)

        G = nx.Graph()
        added_node = []
        species_weight = {}
        if dicti['still_unprod'] != []:
            logger.error('ERROR ', dicti["still_unprod"], ' is unproducible')
            logger.error('ERROR: Please remove these unproducible targets ({0}) from the targets file, delete output folder ({1}) re-run m2m_analysis'.format(','.join(dicti["still_unprod"]), output_dir))
            sys.exit(1)

        len_target[target_category] = len(dicti['newly_prod']) + len(dicti['still_unprod'])
        len_min_sol[target_category] = len(dicti['bacteria'])
        len_union[target_category] = len(dicti['union_bacteria'])
        len_intersection[target_category] = len(dicti['inter_bacteria'])
        key_species_types = {organism:'ES' if organism in dicti['inter_bacteria'] else 'AS' for organism in dicti['union_bacteria']}
        if taxon_file:
            for taxon in all_taxons:
                key_species_data[target_category]['essential_symbionts'][taxon] = [organism for organism in key_species_types if key_species_types[organism] == 'ES' and taxon_named_species[organism].split('__')[0] == taxon]
                key_species_data[target_category]['alternative_symbionts'][taxon] = [organism for organism in key_species_types if key_species_types[organism] == 'AS' and taxon_named_species[organism].split('__')[0] == taxon]
        else:
            key_species_data[target_category]['essential_symbionts']['data'] = [organism for organism in key_species_types if key_species_types[organism] == 'ES']
            key_species_data[target_category]['alternative_symbionts']['data'] = [organism for organism in key_species_types if key_species_types[organism] == 'AS']

        len_solution[target_category] = len(dicti['enum_bacteria'])
        for sol in dicti['enum_bacteria']:
            if len(dicti['enum_bacteria'][sol]) > 1:
                for species_1, species_2 in combinations(dicti['enum_bacteria'][sol], 2):
                    if species_1 not in added_node:
                        if taxon_file:
                            G.add_node(taxon_named_species[species_1], note=key_species_types[species_1])
                        else:
                            G.add_node(species_1, note=key_species_types[species_1])
                        added_node.append(species_1)
                    if species_2 not in added_node:
                        if taxon_file:
                            G.add_node(taxon_named_species[species_2], note=key_species_types[species_2])
                        else:
                            G.add_node(species_2, note=key_species_types[species_2])
                        added_node.append(species_2)
                    combination_species = '_'.join(sorted([species_1, species_2]))
                    if combination_species not in species_weight:
                        species_weight[combination_species] = 1
                    else:
                        species_weight[combination_species] += 1
                    if taxon_file:
                        G.add_edge(taxon_named_species[species_1], taxon_named_species[species_2], weight=species_weight[combination_species])
                    else:
                        G.add_edge(species_1, species_2, weight=species_weight[combination_species])
            elif len(dicti['enum_bacteria'][sol]) == 1:
                species_1 = dicti['enum_bacteria'][sol][0]
                if species_1 not in added_node:
                    if taxon_file:
                        G.add_node(taxon_named_species[species_1], note=key_species_types[species_1])
                    else:
                        G.add_node(species_1, note=key_species_types[species_1])
                    added_node.append(species_1)

        # Check if all the nodes of G are not isolates.
        if len(G.nodes) == nx.number_of_isolates(G):
            logger.critical(r'/!\ Warning: All the nodes of the solution graph are isolated (they are not connected to other nodes). This lead to powergrasp creating an empty powergraph.')
            logger.critical('So m2m_analysis stops at the solution graph step.')
            sys.exit(1)

        miscoto_stat_output_datas.append([target_category, str(len_target[target_category]), str(len_min_sol[target_category]),
                                str(len_union[target_category]), str(len_intersection[target_category]),
                                str(len_solution[target_category])])
        logger.info('######### Graph of ' + target_category + ' #########')
        logger.info('Number of nodes: ' + str(G.number_of_nodes()))
        logger.info('Number of edges: ' + str(G.number_of_edges()))
        nx.write_gml(G, target_output_gml_path)

    with open(miscoto_stat_output, 'w') as stats_output:
        statswriter = csv.writer(stats_output, delimiter="\t")
        statswriter.writerow(['categories', 'nb_target', 'size_min_sol', 'size_union', 'size_intersection', 'size_enum'])
        for miscoto_stat_output_data in miscoto_stat_output_datas:
            statswriter.writerow(miscoto_stat_output_data)

    with open(key_species_json, 'w') as json_output:
        json.dump(key_species_data, json_output, indent=4)

    with open(key_species_stats_output, 'w') as key_stat_output:
        key_stats_writer = csv.writer(key_stat_output, delimiter='\t')
        if all_taxons:
            key_stats_writer.writerow(['target_categories', 'key_group', *sorted(all_taxons), 'Sum'])
        else:
            key_stats_writer.writerow(['target_categories', 'key_group', 'data', 'Sum'])
        for target in key_species_data:
            if all_taxons:
                essential_counts = [len(key_species_data[target]['essential_symbionts'][taxon]) for taxon in sorted(all_taxons)]
                alternative_counts = [len(key_species_data[target]['alternative_symbionts'][taxon]) for taxon in sorted(all_taxons)]
            else:
                essential_counts = [len(key_species_data[target]['essential_symbionts']['data'])]
                alternative_counts = [len(key_species_data[target]['alternative_symbionts']['data'])]
            key_counts = list(map(add, essential_counts, alternative_counts))
            key_stats_writer.writerow([target, 'key_species', *key_counts, sum(key_counts)])
            key_stats_writer.writerow([target, 'essential_symbionts', *essential_counts, sum(essential_counts)])
            key_stats_writer.writerow([target, 'alternative_symbionts', *alternative_counts, sum(alternative_counts)])