-
Notifications
You must be signed in to change notification settings - Fork 29
refactor: separate statistic computation #411
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
6ec4f62
9987189
25eef5e
cb373c1
47a9e26
898d568
c675ece
3c81d05
1ca730e
d67186d
fd483c3
fd5046f
79cf748
339d915
85e0ea8
804849a
cf3c6a0
0f7acca
ae61e57
b038ecf
4fe949d
a86354f
9053a59
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,11 +2,12 @@ import os | |
| from spras import runner | ||
| import shutil | ||
| import yaml | ||
| from spras.dataset import Dataset | ||
| from spras.evaluation import Evaluation | ||
| from spras.analysis import ml, summary, cytoscape | ||
| from spras.config.revision import detach_spras_revision | ||
| import spras.config.config as _config | ||
| from spras.dataset import Dataset | ||
| from spras.evaluation import Evaluation | ||
| from spras.statistics import from_output_pathway, statistics_computation, statistics_options | ||
|
|
||
| # Snakemake updated the behavior in the 6.5.0 release https://github.com/snakemake/snakemake/pull/1037 | ||
| # and using the wrong separator prevents Snakemake from matching filenames to the rules that can produce them | ||
|
|
@@ -312,18 +313,48 @@ rule viz_cytoscape: | |
| run: | ||
| cytoscape.run_cytoscape(input.pathways, output.session, container_settings) | ||
|
|
||
| # We generate new Snakemake rules for every statistic | ||
| # to allow parallel and lazy computation of individual statistics | ||
| for keys in statistics_computation.keys(): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because of the complicated typing, I find it hard to track what |
||
| pythonic_name = 'generate_' + '_and_'.join([key.lower().replace(' ', '_') for key in keys]) | ||
| rule: | ||
| # (See https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#procedural-rule-definition) | ||
| name: pythonic_name | ||
|
tristan-f-r marked this conversation as resolved.
|
||
| input: pathway_file = rules.parse_output.output.standardized_file | ||
|
tristan-f-r marked this conversation as resolved.
|
||
| output: [SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'statistics', f'{key}.txt']) for key in keys] | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need to sanitize the keys here as done in the pythonic_name above? When I run locally, I see output files like |
||
| # It is very tempting to use `.items()` instead of `.keys()` above, but | ||
|
tristan-f-r marked this conversation as resolved.
|
||
| # We instead need to pass keys in via parameters, else the job would use the latest values in the statistics_computation. | ||
| # More info is in the procedural rule link above | ||
| params: statistics_names=keys | ||
| run: | ||
| (Path(input.pathway_file).parent / 'statistics').mkdir(exist_ok=True) | ||
| graph = from_output_pathway(input.pathway_file) | ||
| for computed, output in zip(statistics_computation[params.statistics_names](graph), output): | ||
| Path(output).write_text(str(computed)) | ||
|
|
||
| # We isolate this to a separate input function, as we want to preserve the dictionary structure | ||
| def summary_files(wildcards): | ||
| return { | ||
| algorithm_param: expand( | ||
| '{out_dir}{sep}{dataset}-{algorithm_param}{sep}statistics{sep}{statistic}.txt', | ||
| out_dir=out_dir, sep=SEP, algorithm_param=algorithm_param, statistic=statistics_options, | ||
| dataset=wildcards.dataset | ||
| ) for algorithm_param in algorithms_with_params | ||
| } | ||
|
|
||
| # Write a single summary table for all pathways for each dataset | ||
| rule summary_table: | ||
| input: | ||
| # Collect all pathways generated for the dataset | ||
| pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params), | ||
| dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle']) | ||
| dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle']), | ||
| # Collect all possible statistics from the `summary_files` dictionary-based input function | ||
| statistics = lambda wildcards: flatten(list(summary_files(wildcards).values())) | ||
| output: summary_table = SEP.join([out_dir, '{dataset}-pathway-summary.txt']) | ||
| run: | ||
| # Load the node table from the pickled dataset file | ||
| node_table = Dataset.from_file(input.dataset_file).node_table | ||
| summary_df = summary.summarize_networks(input.pathways, node_table, algorithm_params, algorithms_with_params) | ||
| summary_df = summary.summarize_networks(input.pathways, node_table, algorithm_params, algorithms_with_params, summary_files(wildcards)) | ||
| summary_df.to_csv(output.summary_table, sep='\t', index=False) | ||
|
|
||
| # Cluster the output pathways for each dataset | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,14 +1,17 @@ | ||
| import ast | ||
| import itertools | ||
| import json | ||
| import os | ||
| from pathlib import Path | ||
| from statistics import median | ||
| from typing import Iterable | ||
| from typing import Iterable, Mapping | ||
|
|
||
| import networkx as nx | ||
| import pandas as pd | ||
|
|
||
| from spras.statistics import from_output_pathway, statistics_options | ||
|
|
||
|
|
||
| def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict], | ||
| algo_with_params: list[str]) -> pd.DataFrame: | ||
| algo_with_params: list[str], statistics_files: Mapping[str, Iterable[str | os.PathLike]]) -> pd.DataFrame: | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We now have |
||
| """ | ||
| Generate a table that aggregates summary information about networks in file_paths, including which nodes are present | ||
| in node_table columns. Network directionality is ignored and all edges are treated as undirected. The order of the | ||
|
|
@@ -18,6 +21,7 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg | |
| @param algo_params: a nested dict mapping algorithm names to dicts that map parameter hashes to parameter | ||
| combinations. | ||
| @param algo_with_params: a list of <algorithm>-params-<params_hash> combinations | ||
| @param statistics_files: a dictionary from algo_with_params to lists of statistic files with the computed statistics. | ||
| @return: pandas DataFrame with summary information | ||
| """ | ||
| # Ensure that NODEID is the first column | ||
|
|
@@ -40,52 +44,22 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg | |
|
|
||
| # Iterate through each network file path | ||
| for index, file_path in enumerate(sorted(file_paths)): | ||
| with open(file_path, 'r') as f: | ||
| lines = f.readlines()[1:] # skip the header line | ||
|
|
||
| # directed or mixed graphs are parsed and summarized as an undirected graph | ||
| nw = nx.read_edgelist(lines, data=(('weight', float), ('Direction', str))) | ||
| nw = from_output_pathway(file_path) | ||
|
|
||
| # Save the network name, number of nodes, number edges, and number of connected components | ||
| nw_name = str(file_path) | ||
| number_nodes = nw.number_of_nodes() | ||
| number_edges = nw.number_of_edges() | ||
| ncc = nx.number_connected_components(nw) | ||
|
|
||
| # Save the max/median degree, average clustering coefficient, and density | ||
| if number_nodes == 0: | ||
| max_degree = 0 | ||
| median_degree = 0.0 | ||
| density = 0.0 | ||
| else: | ||
| degrees = [deg for _, deg in nw.degree()] | ||
| max_degree = max(degrees) | ||
| median_degree = median(degrees) | ||
| density = nx.density(nw) | ||
|
|
||
| cc = list(nx.connected_components(nw)) | ||
| # Save the max diameter | ||
| # Use diameter only for components with ≥2 nodes (singleton components have diameter 0) | ||
| diameters = [ | ||
| nx.diameter(nw.subgraph(c).copy()) if len(c) > 1 else 0 | ||
| for c in cc | ||
| ] | ||
| max_diameter = max(diameters, default=0) | ||
|
|
||
| # Save the average path lengths | ||
| # Compute average shortest path length only for components with ≥2 nodes (undefined for singletons, set to 0.0) | ||
| avg_path_lengths = [ | ||
| nx.average_shortest_path_length(nw.subgraph(c).copy()) if len(c) > 1 else 0.0 | ||
| for c in cc | ||
| # We use ast.literal_eval here to convert statistic file outputs to ints or floats depending on their string representation. | ||
| # (e.g. "5.0" -> float(5.0), while "5" -> int(5).) | ||
| graph_statistics = [ | ||
| ast.literal_eval(Path(file).read_text()) for file in | ||
| # along with sorting to keep the output stable (we do this same sorting procedure once more in this function) | ||
| sorted(statistics_files[algo_with_params[index]], key=lambda x: statistics_options.index(Path(x).stem)) | ||
| ] | ||
|
|
||
| if len(avg_path_lengths) != 0: | ||
| avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths) | ||
| else: | ||
| avg_path_len = 0.0 | ||
|
|
||
| # Initialize list to store current network information | ||
| cur_nw_info = [nw_name, number_nodes, number_edges, ncc, density, max_degree, median_degree, max_diameter, avg_path_len] | ||
| cur_nw_info = [nw_name, *graph_statistics] | ||
|
|
||
| # Iterate through each node property and save the intersection with the current network | ||
| for node_list in nodes_by_col: | ||
|
|
@@ -107,8 +81,13 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg | |
| # Save the current network information to the network summary list | ||
| nw_info.append(cur_nw_info) | ||
|
|
||
| # Get the list of statistic names by their file names (via finding all requested statistics in the provided files) | ||
| current_statistics_options = sorted( | ||
| set(Path(file).stem for file in itertools.chain(*statistics_files.values())), | ||
| key=lambda x: statistics_options.index(x) | ||
| ) | ||
| # Prepare column names | ||
| col_names = ['Name', 'Number of nodes', 'Number of edges', 'Number of connected components', 'Density', 'Max degree', 'Median degree', 'Max diameter', 'Average path length'] | ||
| col_names = ['Name', *current_statistics_options] | ||
| col_names.extend(nodes_by_col_labs) | ||
| col_names.append('Parameter combination') | ||
|
|
||
|
|
@@ -120,67 +99,3 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg | |
| ) | ||
|
|
||
| return nw_info | ||
|
|
||
|
|
||
| def degree(g): | ||
| return dict(g.degree) | ||
|
|
||
| # TODO: redo .run code to work on mixed graphs | ||
|
tristan-f-r marked this conversation as resolved.
|
||
| # stats is just a list of functions to apply to the graph. | ||
| # They should take as input a networkx graph or digraph but may have any output. | ||
| # stats = [degree, nx.clustering, nx.betweenness_centrality] | ||
|
|
||
|
|
||
| # def produce_statistics(g: nx.Graph, s=None) -> dict: | ||
| # global stats | ||
| # if s is not None: | ||
| # stats = s | ||
| # d = dict() | ||
| # for s in stats: | ||
| # sname = s.__name__ | ||
| # d[sname] = s(g) | ||
| # return d | ||
|
|
||
|
|
||
| # def load_graph(path: str) -> nx.Graph: | ||
| # g = nx.read_edgelist(path, data=(('weight', float), ('Direction',str))) | ||
| # return g | ||
|
|
||
|
|
||
| # def save(data, pth): | ||
| # fout = open(pth, 'w') | ||
| # fout.write('#node\t%s\n' % '\t'.join([s.__name__ for s in stats])) | ||
| # for node in data[stats[0].__name__]: | ||
| # row = [data[s.__name__][node] for s in stats] | ||
| # fout.write('%s\t%s\n' % (node, '\t'.join([str(d) for d in row]))) | ||
| # fout.close() | ||
|
|
||
|
|
||
| # def run(infile: str, outfile: str) -> None: | ||
| # """ | ||
| # run function that wraps above functions. | ||
| # """ | ||
| # # if output directory doesn't exist, make it. | ||
| # outdir = os.path.dirname(outfile) | ||
| # if not os.path.exists(outdir): | ||
| # os.makedirs(outdir) | ||
|
|
||
| # # load graph, produce stats, and write to human-readable file. | ||
| # g = load_graph(infile) | ||
| # dat = produce_statistics(g) | ||
| # save(dat, outfile) | ||
|
|
||
|
|
||
| # def main(argv): | ||
| # """ | ||
| # for testing | ||
| # """ | ||
| # g = load_graph(argv[1]) | ||
| # print(g.nodes) | ||
| # dat = produce_statistics(g) | ||
| # print(dat) | ||
| # save(dat, argv[2]) | ||
|
|
||
|
|
||
| # if __name__ == '__main__': | ||
| # main(sys.argv) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,80 @@ | ||
| """ | ||
| Graph statistics, used to power summary.py. | ||
|
tristan-f-r marked this conversation as resolved.
|
||
|
|
||
| We allow for arbitrary computation of any specific statistic on some graph, | ||
| computing more than necessary if we have dependencies. See the top level | ||
| `statistics_computation` dictionary for usage. | ||
|
|
||
| To make the statistics allow directed graph input, they will always take | ||
| in a networkx.DiGraph, which contains even more information, even though | ||
| the underlying graph may be just as easily represented by networkx.Graph. | ||
|
Comment on lines
+8
to
+10
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this a change in functionality? In summary.py we had the opposite
When we previously assessed treating all graphs as directed or undirected, we decided that undirected would be less wrong. |
||
| """ | ||
|
|
||
| import itertools | ||
| from statistics import median | ||
| from typing import Callable | ||
|
|
||
| import networkx as nx | ||
|
|
||
|
|
||
| def compute_degree(graph: nx.DiGraph) -> tuple[int, float]: | ||
| """ | ||
| Computes the (max, median) degree of a `graph`. | ||
| """ | ||
| # number_of_nodes is a cheap call | ||
| if graph.number_of_nodes() == 0: | ||
| return (0, 0.0) | ||
| else: | ||
| degrees = [deg for _, deg in graph.degree()] | ||
| return max(degrees), median(degrees) | ||
|
|
||
| def compute_max_diameter(directed_graph: nx.DiGraph) -> tuple[int,]: | ||
| # We convert our directed_graph to an undirected graph as networkx (reasonably) does | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can't remember why we do this. @agitter I remember we talked about this years ago.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [I'm a little confused - is it why we compute the number of connected components in the first place?]
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If it's about the undirected graph conversion, that comment should be why.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ntalluri was your comment asking why we convert to undirected for the connected component calculation? Related to my comment above, I recommend we stay with reading undirected graphs and use them throughout. That affects other statistics like degree as well. |
||
| # not allow for computing the connected components of a directed graph, but the connected | ||
| # component count still is a useful statistic for us. | ||
| # We do this a few more times throughout the file. | ||
| graph: nx.Graph = directed_graph.to_undirected() | ||
| cc = list(nx.connected_components(graph)) | ||
| # Save the max diameter | ||
| # Use diameter only for components with ≥2 nodes (singleton components have diameter 0) | ||
| diameters = [ | ||
| nx.diameter(graph.subgraph(c).copy()) if len(c) > 1 else 0 | ||
| for c in cc | ||
| ] | ||
| return (max(diameters, default=0),) | ||
|
|
||
| def compute_avg_path_lengths(directed_graph: nx.DiGraph) -> tuple[float,]: | ||
| graph: nx.Graph = directed_graph.to_undirected() | ||
| cc = list(nx.connected_components(graph)) | ||
| # Save the average path lengths | ||
| # Compute average shortest path length only for components with ≥2 nodes (undefined for singletons, set to 0.0) | ||
| avg_path_lengths = [ | ||
| nx.average_shortest_path_length(directed_graph.subgraph(c).copy()) if len(c) > 1 else 0.0 | ||
| for c in cc | ||
| ] | ||
|
|
||
| if len(avg_path_lengths) != 0: | ||
| avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths) | ||
| else: | ||
| avg_path_len = 0.0 | ||
|
|
||
| return (avg_path_len,) | ||
|
|
||
| # The type signature here is meant to be 'an n-tuple has n outputs.' | ||
| statistics_computation: dict[tuple[str, ...], Callable[[nx.DiGraph], tuple[float | int, ...]]] = { | ||
|
tristan-f-r marked this conversation as resolved.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This syntax is somewhat confusing. It took me a minute to realize why everything needs to be a tuple, that sometimes we have a function return one statistic and sometimes multiple. The design makes sense now that I get it, but a comment would have helped. |
||
| ('Number of nodes',): lambda graph : (graph.number_of_nodes(),), | ||
| ('Number of edges',): lambda graph : (graph.number_of_edges(),), | ||
| ('Number of connected components',): lambda graph : (nx.number_connected_components(graph.to_undirected()),), | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again, having to convert everything to undirected is messy. |
||
| ('Density',): lambda graph : (nx.density(graph),), | ||
| ('Max degree', 'Median degree'): compute_degree, | ||
| ('Max diameter',): compute_max_diameter, | ||
| ('Average path length',): compute_avg_path_lengths, | ||
| } | ||
|
|
||
| # All of the keys inside statistics_computation, flattened. | ||
| statistics_options: list[str] = list(itertools.chain(*(list(key) for key in statistics_computation.keys()))) | ||
|
|
||
| def from_output_pathway(lines) -> nx.Graph: | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When I saw it in summary.py, I didn't recognize this function as a graph loader from the name. |
||
| with open(lines, 'r') as f: | ||
| next(f) # skip the header line | ||
| return nx.read_edgelist(f, data=(('Rank', int), ('Direction', str)), delimiter='\t') | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,4 @@ | ||
| import filecmp | ||
| import shutil | ||
| import subprocess | ||
| from pathlib import Path | ||
|
|
||
|
|
@@ -32,7 +31,7 @@ def snakemake_output(request): | |
| param = request.param | ||
| subprocess.run(["snakemake", "--cores", "1", "--configfile", f"test/analysis/input/{param}.yaml"]) | ||
| yield param # this runs the test itself: once this is passed, we go to test cleanup. | ||
| shutil.rmtree(f"test/analysis/input/run/{param}") | ||
| # shutil.rmtree(f"test/analysis/input/run/{param}") | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we not need this anymore? |
||
|
|
||
| class TestSummary: | ||
| @classmethod | ||
|
|
@@ -56,12 +55,15 @@ def test_example_networks(self, snakemake_output): | |
| algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in | ||
| algorithm_params.items() for params_hash in param_combos.keys()] | ||
|
|
||
| example_network_files = (INPUT_DIR / "run" / snakemake_output).rglob("pathway.txt") | ||
| network_files = (INPUT_DIR / "run" / snakemake_output).rglob("pathway.txt") | ||
| statistics_folders = [Path(file) for file in (INPUT_DIR / "run" / snakemake_output).rglob("**/statistics") if Path(file).name == "statistics"] | ||
| # We do some string fiddling here to make sure the folder matches up with algorithms_with_params. This may be susceptible to a good refactor. | ||
| statistics_files = {"-".join(folder.parent.stem.split("-")[1:]): list(folder.glob("*.txt")) for folder in statistics_folders} | ||
|
|
||
| out_path = Path(OUT_DIR, f"test_{snakemake_output}_summary.txt") | ||
| out_path.unlink(missing_ok=True) | ||
| summarize_out = summarize_networks(example_network_files, example_node_table, algorithm_params, | ||
| algorithms_with_params) | ||
| summarize_out = summarize_networks(network_files, example_node_table, algorithm_params, | ||
| algorithms_with_params, statistics_files) | ||
| # We do some post-processing to ensure that we get a stable summarize_out, since the attached hash | ||
| # is subject to variation (especially in testing) whenever the SPRAS commit revision gets changed | ||
| summarize_out["Parameter combination"] = summarize_out["Parameter combination"].astype(str) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.