Skip to content

Commit

Permalink
Merge pull request #6 from Sydney-Informatics-Hub/jux_visual
Browse files Browse the repository at this point in the history
Changes to work with the Jux visual panel
  • Loading branch information
milysun authored Sep 23, 2024
2 parents 42a6d25 + 2eb60a6 commit 9b31a5a
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 137 deletions.
44 changes: 14 additions & 30 deletions juxtorpus/viz/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@
def wordclouds(corpora, names: list[str],
max_words: int = 50,
metric: str = 'tf',
word_type: str = 'word',
stopwords: list[str] = None,
lower: bool = True):
dtm_name: str = 'tokens',
stopwords: list[str] = None):
MAX_COLS = 2
nrows = math.ceil(len(names) / 2)
fig, axes = plt.subplots(nrows=nrows, ncols=MAX_COLS, figsize=(16, 16 * 1.5))
Expand All @@ -26,9 +25,9 @@ def wordclouds(corpora, names: list[str],
wc = _wordcloud(corpus,
max_words=max_words,
metric=metric,
word_type=word_type,
dtm_name=dtm_name,
stopwords=stopwords,
lower=lower)
return_wc=False)
if nrows == 1:
ax = axes[c]
else:
Expand All @@ -42,9 +41,11 @@ def wordclouds(corpora, names: list[str],
plt.show()


def wordcloud(corpus, metric: str = 'tf', max_words: int = 50, word_type: str = 'word',
stopwords: list[str] = None, lower: bool = True):
wc = _wordcloud(corpus, max_words, metric, word_type, stopwords, lower)
def wordcloud(corpus, metric: str = 'tf', max_words: int = 50, dtm_name: str = 'tokens',
stopwords: list[str] = None, return_wc: bool = False):
wc = _wordcloud(corpus, max_words, metric, dtm_name, stopwords)
if return_wc:
return wc
# h, w = 12, 12 * 1.5
h, w = 6, 10
plt.figure(figsize=(h, w))
Expand All @@ -54,34 +55,17 @@ def wordcloud(corpus, metric: str = 'tf', max_words: int = 50, word_type: str =
plt.show()


def _wordcloud(corpus, max_words: int, metric: str, word_type: str, stopwords: list[str] = None, lower: bool = True):
def _wordcloud(corpus, max_words: int, metric: str, dtm_name: str, stopwords: list[str] = None):
if stopwords is None: stopwords = list()
stopwords.extend(ENGLISH_STOP_WORDS)
word_types = {'word', 'hashtag', 'mention'}
# word_types = {'word', 'hashtag', 'mention'}
metrics = {'tf', 'tfidf'}
assert word_type in word_types, f"{word_type} not in {', '.join(word_types)}"
assert dtm_name in corpus.dtms.keys(), f"{dtm_name} not in {', '.join(corpus.dtms.keys())}"
assert metric in metrics, f"{metric} not in {', '.join(metrics)}"
wc = WordCloud(background_color='white', max_words=max_words, height=600, width=1200, stopwords=stopwords)

def lower_wrapper(gen) -> Callable:
def generate_lowered(doc):
return (str(x).lower() for x in gen(doc))

return generate_lowered

if word_type == 'word':
dtm = corpus.dtm # corpus dtm is always lower cased.
elif word_type == 'hashtag':
gen = corpus._gen_hashtags_from
if lower: gen = lower_wrapper(gen)
dtm = corpus.create_custom_dtm(tokeniser_func=gen, inplace=False)
elif word_type == 'mention':
gen = corpus._gen_mentions_from
if lower: gen = lower_wrapper(gen)
dtm = corpus.create_custom_dtm(tokeniser_func=gen, inplace=False)
else:
raise ValueError(f"Word type {word_type} is not supported. Must be one of {', '.join(word_types)}")

dtm = corpus.dtms[dtm_name]

if metric == 'tf':
with dtm.without_terms(stopwords) as dtm:
counter = dtm.freq_table().series.to_dict()
Expand Down
Loading

0 comments on commit 9b31a5a

Please sign in to comment.