Skip to content

Commit

Permalink
v0.14 (#977)
Browse files: browse the repository at this point in the history
* Add representation models
  * bertopic.representation.KeyBERTInspired
  * bertopic.representation.PartOfSpeech
  * bertopic.representation.MaximalMarginalRelevance
  * bertopic.representation.Cohere
  * bertopic.representation.OpenAI
  * bertopic.representation.TextGeneration
  * bertopic.representation.LangChain
  * bertopic.representation.ZeroShotClassification
* Fix topic selection when extracting representative documents
* Improve documentation, #769, #954, #912
* Add wordcloud example to documentation
* Add title param for each graph, #800
* Improve the nr_topics procedure
* Fix #952, #903, #911, #965. Add #976
  • Loading branch information
MaartenGr authored Feb 14, 2023
1 parent 04dccbe commit 7142ce7
Show file tree
Hide file tree
Showing 70 changed files with 2,993 additions and 861 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.7, 3.8]
python-version: [3.8, 3.9]

steps:
- uses: actions/checkout@v2
Expand Down
2 changes: 1 addition & 1 deletion bertopic/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from bertopic._bertopic import BERTopic

__version__ = "0.13.0"
__version__ = "0.14.0"

__all__ = [
"BERTopic",
Expand Down
325 changes: 154 additions & 171 deletions bertopic/_bertopic.py

Large diffs are not rendered by default.

53 changes: 0 additions & 53 deletions bertopic/_mmr.py

This file was deleted.

9 changes: 6 additions & 3 deletions bertopic/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,15 @@ class NotInstalled:
installed in order to use the string matching model.
"""

def __init__(self, tool, dep):
def __init__(self, tool, dep, custom_msg=None):
self.tool = tool
self.dep = dep

msg = f"In order to use {self.tool} you'll need to install via;\n\n"
msg += f"pip install bertopic[{self.dep}]\n\n"
msg = f"In order to use {self.tool} you will need to install via;\n\n"
if custom_msg is not None:
msg += custom_msg
else:
msg += f"pip install bertopic[{self.dep}]\n\n"
self.msg = msg

def __getattr__(self, *args, **kwargs):
Expand Down
4 changes: 2 additions & 2 deletions bertopic/plotting/_barchart.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def visualize_barchart(topic_model,
top_n_topics: int = 8,
n_words: int = 5,
custom_labels: bool = False,
title: str = "Topic Word Scores",
title: str = "<b>Topic Word Scores</b>",
width: int = 250,
height: int = 250) -> go.Figure:
""" Visualize a barchart of selected topics
Expand Down Expand Up @@ -99,7 +99,7 @@ def visualize_barchart(topic_model,
template="plotly_white",
showlegend=False,
title={
'text': f"<b>{title}",
'text': f"{title}",
'x': .5,
'xanchor': 'center',
'yanchor': 'top',
Expand Down
4 changes: 3 additions & 1 deletion bertopic/plotting/_distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ def visualize_distribution(topic_model,
probabilities: np.ndarray,
min_probability: float = 0.015,
custom_labels: bool = False,
title: str = "<b>Topic Probability Distribution</b>",
width: int = 800,
height: int = 600) -> go.Figure:
""" Visualize the distribution of topic probabilities
Expand All @@ -17,6 +18,7 @@ def visualize_distribution(topic_model,
All others are ignored.
custom_labels: Whether to use custom topic labels that were defined using
`topic_model.set_topic_labels`.
title: Title of the plot.
width: The width of the figure.
height: The height of the figure.
Expand Down Expand Up @@ -80,7 +82,7 @@ def visualize_distribution(topic_model,
fig.update_layout(
xaxis_title="Probability",
title={
'text': "<b>Topic Probability Distribution",
'text': f"{title}",
'y': .95,
'x': 0.5,
'xanchor': 'center',
Expand Down
4 changes: 3 additions & 1 deletion bertopic/plotting/_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def visualize_documents(topic_model,
hide_annotations: bool = False,
hide_document_hover: bool = False,
custom_labels: bool = False,
title: str = "<b>Documents and Topics</b>",
width: int = 1200,
height: int = 750):
""" Visualize documents and their topics in 2D
Expand All @@ -37,6 +38,7 @@ def visualize_documents(topic_model,
specific points. Helps to speed up generation of visualization.
custom_labels: Whether to use custom topic labels that were defined using
`topic_model.set_topic_labels`.
title: Title of the plot.
width: The width of the figure.
height: The height of the figure.
Expand Down Expand Up @@ -203,7 +205,7 @@ def visualize_documents(topic_model,
fig.update_layout(
template="simple_white",
title={
'text': "<b>Documents and Topics",
'text': f"{title}",
'x': 0.5,
'xanchor': 'center',
'yanchor': 'top',
Expand Down
4 changes: 3 additions & 1 deletion bertopic/plotting/_heatmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def visualize_heatmap(topic_model,
top_n_topics: int = None,
n_clusters: int = None,
custom_labels: bool = False,
title: str = "<b>Similarity Matrix</b>",
width: int = 800,
height: int = 800) -> go.Figure:
""" Visualize a heatmap of the topic's similarity matrix
Expand All @@ -27,6 +28,7 @@ def visualize_heatmap(topic_model,
matrix by those clusters.
custom_labels: Whether to use custom topic labels that were defined using
`topic_model.set_topic_labels`.
title: Title of the plot.
width: The width of the figure.
height: The height of the figure.
Expand Down Expand Up @@ -108,7 +110,7 @@ def visualize_heatmap(topic_model,

fig.update_layout(
title={
'text': "<b>Similarity Matrix",
'text': f"{title}",
'y': .95,
'x': 0.55,
'xanchor': 'center',
Expand Down
4 changes: 3 additions & 1 deletion bertopic/plotting/_hierarchical_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def visualize_hierarchical_documents(topic_model,
hide_document_hover: bool = True,
nr_levels: int = 10,
custom_labels: bool = False,
title: str = "<b>Hierarchical Documents and Topics</b>",
width: int = 1200,
height: int = 750) -> go.Figure:
""" Visualize documents and their topics in 2D at different levels of hierarchy
Expand Down Expand Up @@ -48,6 +49,7 @@ def visualize_hierarchical_documents(topic_model,
`topic_model.set_topic_labels`.
NOTE: Custom labels are only generated for the original
un-merged topics.
title: Title of the plot.
width: The width of the figure.
height: The height of the figure.
Expand Down Expand Up @@ -300,7 +302,7 @@ def visualize_hierarchical_documents(topic_model,
sliders=sliders,
template="simple_white",
title={
'text': "<b>Hierarchical Documents and Topics",
'text': f"{title}",
'x': 0.5,
'xanchor': 'center',
'yanchor': 'top',
Expand Down
4 changes: 3 additions & 1 deletion bertopic/plotting/_hierarchy.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def visualize_hierarchy(topic_model,
topics: List[int] = None,
top_n_topics: int = None,
custom_labels: bool = False,
title: str = "<b>Hierarchical Clustering</b>",
width: int = 1000,
height: int = 600,
hierarchical_topics: pd.DataFrame = None,
Expand All @@ -37,6 +38,7 @@ def visualize_hierarchy(topic_model,
`topic_model.set_topic_labels`.
NOTE: Custom labels are only generated for the original
un-merged topics.
title: Title of the plot.
width: The width of the figure. Only works if orientation is set to 'left'
height: The height of the figure. Only works if orientation is set to 'bottom'
hierarchical_topics: A dataframe that contains a hierarchy of topics
Expand Down Expand Up @@ -143,7 +145,7 @@ def visualize_hierarchy(topic_model,
plot_bgcolor='#ECEFF1',
template="plotly_white",
title={
'text': "<b>Hierarchical Clustering",
'text': f"{title}",
'x': 0.5,
'xanchor': 'center',
'yanchor': 'top',
Expand Down
4 changes: 3 additions & 1 deletion bertopic/plotting/_term_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ def visualize_term_rank(topic_model,
topics: List[int] = None,
log_scale: bool = False,
custom_labels: bool = False,
title: str = "<b>Term score decline per Topic</b>",
width: int = 800,
height: int = 500) -> go.Figure:
""" Visualize the ranks of all terms across all topics
Expand All @@ -23,6 +24,7 @@ def visualize_term_rank(topic_model,
log_scale: Whether to represent the ranking on a log scale
custom_labels: Whether to use custom topic labels that were defined using
`topic_model.set_topic_labels`.
title: Title of the plot.
width: The width of the figure.
height: The height of the figure.
Expand Down Expand Up @@ -103,7 +105,7 @@ def visualize_term_rank(topic_model,
showlegend=False,
template="plotly_white",
title={
'text': "<b>Term score decline per Topic</b>",
'text': f"{title}",
'y': .9,
'x': 0.5,
'xanchor': 'center',
Expand Down
19 changes: 14 additions & 5 deletions bertopic/plotting/_topics.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
def visualize_topics(topic_model,
topics: List[int] = None,
top_n_topics: int = None,
custom_labels: bool = False,
title: str = "<b>Intertopic Distance Map</b>",
width: int = 650,
height: int = 650) -> go.Figure:
""" Visualize topics, their sizes, and their corresponding words
Expand All @@ -22,6 +24,9 @@ def visualize_topics(topic_model,
topic_model: A fitted BERTopic instance.
topics: A selection of topics to visualize
top_n_topics: Only select the top n most frequent topics
custom_labels: Whether to use custom topic labels that were defined using
`topic_model.set_topic_labels`.
title: Title of the plot.
width: The width of the figure.
height: The height of the figure.
Expand Down Expand Up @@ -55,23 +60,27 @@ def visualize_topics(topic_model,
# Extract topic words and their frequencies
topic_list = sorted(topics)
frequencies = [topic_model.topic_sizes_[topic] for topic in topic_list]
words = [" | ".join([word[0] for word in topic_model.get_topic(topic)[:5]]) for topic in topic_list]
if custom_labels and topic_model.custom_labels_ is not None:
words = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in topic_list]
else:
words = [" | ".join([word[0] for word in topic_model.get_topic(topic)[:5]]) for topic in topic_list]

# Embed c-TF-IDF into 2D
all_topics = sorted(list(topic_model.get_topics().keys()))
indices = np.array([all_topics.index(topic) for topic in topics])
embeddings = topic_model.c_tf_idf_.toarray()[indices]
embeddings = MinMaxScaler().fit_transform(embeddings)
embeddings = UMAP(n_neighbors=2, n_components=2, metric='hellinger').fit_transform(embeddings)
embeddings = UMAP(n_neighbors=2, n_components=2, metric='hellinger', random_state=42).fit_transform(embeddings)

# Visualize with plotly
df = pd.DataFrame({"x": embeddings[:, 0], "y": embeddings[:, 1],
"Topic": topic_list, "Words": words, "Size": frequencies})
return _plotly_topic_visualization(df, topic_list, width, height)
return _plotly_topic_visualization(df, topic_list, title, width, height)


def _plotly_topic_visualization(df: pd.DataFrame,
topic_list: List[str],
title: str,
width: int,
height: int):
""" Create plotly-based visualization of topics with a slider for topic selection """
Expand All @@ -94,7 +103,7 @@ def get_color(topic_selected):

# Update hover order
fig.update_traces(hovertemplate="<br>".join(["<b>Topic %{customdata[0]}</b>",
"Words: %{customdata[1]}",
"%{customdata[1]}",
"Size: %{customdata[2]}"]))

# Create a slider for topic selection
Expand All @@ -104,7 +113,7 @@ def get_color(topic_selected):
# Stylize layout
fig.update_layout(
title={
'text': "<b>Intertopic Distance Map",
'text': f"{title}",
'y': .95,
'x': 0.5,
'xanchor': 'center',
Expand Down
4 changes: 3 additions & 1 deletion bertopic/plotting/_topics_over_time.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ def visualize_topics_over_time(topic_model,
topics: List[int] = None,
normalize_frequency: bool = False,
custom_labels: bool = False,
title: str = "<b>Topics over Time</b>",
width: int = 1250,
height: int = 450) -> go.Figure:
""" Visualize topics over time
Expand All @@ -23,6 +24,7 @@ def visualize_topics_over_time(topic_model,
normalize_frequency: Whether to normalize each topic's frequency individually
custom_labels: Whether to use custom topic labels that were defined using
`topic_model.set_topic_labels`.
title: Title of the plot.
width: The width of the figure.
height: The height of the figure.
Expand Down Expand Up @@ -91,7 +93,7 @@ def visualize_topics_over_time(topic_model,
fig.update_layout(
yaxis_title="Normalized Frequency" if normalize_frequency else "Frequency",
title={
'text': "<b>Topics over Time",
'text': f"{title}",
'y': .95,
'x': 0.40,
'xanchor': 'center',
Expand Down
4 changes: 3 additions & 1 deletion bertopic/plotting/_topics_per_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ def visualize_topics_per_class(topic_model,
topics: List[int] = None,
normalize_frequency: bool = False,
custom_labels: bool = False,
title: str = "<b>Topics per Class</b>",
width: int = 1250,
height: int = 900) -> go.Figure:
""" Visualize topics per class
Expand All @@ -23,6 +24,7 @@ def visualize_topics_per_class(topic_model,
normalize_frequency: Whether to normalize each topic's frequency individually
custom_labels: Whether to use custom topic labels that were defined using
`topic_model.set_topic_labels`.
title: Title of the plot.
width: The width of the figure.
height: The height of the figure.
Expand Down Expand Up @@ -98,7 +100,7 @@ def visualize_topics_per_class(topic_model,
xaxis_title="Normalized Frequency" if normalize_frequency else "Frequency",
yaxis_title="Class",
title={
'text': "<b>Topics per Class",
'text': f"{title}",
'y': .95,
'x': 0.40,
'xanchor': 'center',
Expand Down
Loading

0 comments on commit 7142ce7

Please sign in to comment.