-
Notifications
You must be signed in to change notification settings - Fork 2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Create separate communities workflow
* Add test for new workflow
* Rename workflows
* Collapse subflows into parents
* Rename flows, reuse variables
* Semver
* Fix integration test
* Fix smoke tests
* Fix megapipeline format
* Rename missed files

---------

Co-authored-by: Alonso Guevara <alonsog@microsoft.com>
- Loading branch information
1 parent
de12521
commit 1d68af3
Showing
36 changed files
with
783 additions
and
735 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
{ | ||
"type": "patch", | ||
"description": "Create separate community workflow, collapse subflows." | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# Copyright (c) 2024 Microsoft Corporation. | ||
# Licensed under the MIT License | ||
|
||
"""All the steps to create the base entity graph.""" | ||
|
||
from typing import Any | ||
|
||
import pandas as pd | ||
|
||
from graphrag.index.operations.cluster_graph import cluster_graph | ||
from graphrag.index.operations.create_graph import create_graph | ||
from graphrag.index.operations.snapshot import snapshot | ||
from graphrag.storage.pipeline_storage import PipelineStorage | ||
|
||
|
||
async def compute_communities(
    base_relationship_edges: pd.DataFrame,
    storage: PipelineStorage,
    clustering_strategy: dict[str, Any],
    snapshot_transient_enabled: bool = False,
) -> pd.DataFrame:
    """Cluster the relationship graph and return the base communities table.

    Args:
        base_relationship_edges: Edge table handed to ``create_graph``.
        storage: Storage passed to ``snapshot`` when snapshotting is enabled.
        clustering_strategy: Settings forwarded to ``cluster_graph`` as its
            ``strategy`` argument.
        snapshot_transient_enabled: When true, the result is also written as
            a ``base_communities`` parquet snapshot before being returned.

    Returns:
        A DataFrame with columns ``level``, ``community``, ``parent`` and
        ``title``, exploded on ``title`` and with ``community`` cast to int.
    """
    clusters = cluster_graph(
        create_graph(base_relationship_edges),
        strategy=clustering_strategy,
    )

    column_names = pd.Index(["level", "community", "parent", "title"])
    community_table = pd.DataFrame(clusters, columns=column_names)
    # One output row per element of each "title" list.
    community_table = community_table.explode("title")
    community_table["community"] = community_table["community"].astype(int)

    if not snapshot_transient_enabled:
        return community_table

    await snapshot(
        community_table,
        name="base_communities",
        storage=storage,
        formats=["parquet"],
    )
    return community_table
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
# Copyright (c) 2024 Microsoft Corporation. | ||
# Licensed under the MIT License | ||
|
||
"""A module containing build_steps method definition.""" | ||
|
||
from typing import Any, cast | ||
|
||
import pandas as pd | ||
from datashaper import ( | ||
Table, | ||
verb, | ||
) | ||
from datashaper.table_store.types import VerbResult, create_verb_result | ||
|
||
from graphrag.index.config.workflow import PipelineWorkflowConfig, PipelineWorkflowStep | ||
from graphrag.index.flows.compute_communities import compute_communities | ||
from graphrag.storage.pipeline_storage import PipelineStorage | ||
|
||
workflow_name = "compute_communities" | ||
|
||
|
||
def build_steps(
    config: PipelineWorkflowConfig,
) -> list[PipelineWorkflowStep]:
    """
    Build the single pipeline step that computes the base communities.

    Reads the optional ``cluster_graph`` section of ``config`` (falling back
    to a leiden strategy when the section is absent) and the optional
    ``snapshot_transient`` flag, and forwards both as verb arguments.

    ## Dependencies
    * `workflow:extract_graph`
    """
    default_clustering = {"strategy": {"type": "leiden"}}
    strategy = config.get("cluster_graph", default_clustering).get("strategy")

    # A missing or falsy setting collapses to False; truthy values pass through.
    transient_snapshot = config.get("snapshot_transient") or False

    verb_args = {
        "clustering_strategy": strategy,
        "snapshot_transient_enabled": transient_snapshot,
    }
    step: PipelineWorkflowStep = {
        "verb": workflow_name,
        "args": verb_args,
        "input": {"source": "workflow:extract_graph"},
    }
    return [step]
|
||
|
||
@verb(
    name=workflow_name,
    treats_input_tables_as_immutable=True,
)
async def workflow(
    storage: PipelineStorage,
    runtime_storage: PipelineStorage,
    clustering_strategy: dict[str, Any],
    snapshot_transient_enabled: bool = False,
    **_kwargs: dict,
) -> VerbResult:
    """Compute base communities from the stored relationship edges.

    Reads ``base_relationship_edges`` from ``runtime_storage``, runs
    ``compute_communities`` on it, and stores the result back in
    ``runtime_storage`` under ``base_communities``. The verb result itself
    wraps an empty DataFrame; the communities are only available through
    ``runtime_storage``.
    """
    edges = await runtime_storage.get("base_relationship_edges")

    communities = await compute_communities(
        edges,
        storage,
        clustering_strategy=clustering_strategy,
        snapshot_transient_enabled=snapshot_transient_enabled,
    )
    await runtime_storage.set("base_communities", communities)

    # The verb's table output is deliberately empty.
    empty_table = cast("Table", pd.DataFrame())
    return create_verb_result(empty_table)
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.