1、基于網絡集群識別的自動化聚類
共現關系聚類,利用社交網絡分析(Social Network Analysis, SNA)來構建知識圖譜,然后進行集群的識別(Community Detection),從而給文本基本單元進行自動分類。
隨便文本代替即可,包括兩列,一列為文檔名或編號,一列為文本內容。
# Load the scraped storage-bottle listing. The CSV is read without a header
# row, so column names are assigned by hand right after reading.
# NOTE(review): `%>%`, set_names(), distinct() and select() are used here
# before the p_load() call that appears later in the file; this snippet
# assumes they are already attached (e.g. via tidyverse) - TODO confirm.
# NOTE(review): `sku_id` is selected at the end of this snippet but is not one
# of the 11 names assigned here, so the select() cannot succeed as written.
# Presumably the CSV carries an ID column whose name was dropped from this
# vector - confirm against the file and add it at the right position.
storagebottles <- read.csv("dataset/ali/storagebottles0905.csv",
                           header = FALSE) %>%
  set_names(c("sku_name", "sku_price", "sku_sale_volume", "sku_score",
              "sku_ship", "sku_isNewin", "sku_isPromotion",
              "sku_isTopselling", "shop_name", "sku_link", "category4")) %>%
  # Drop fully duplicated rows. TRUE/FALSE are spelled out because T/F are
  # ordinary variables that can be reassigned.
  distinct(.keep_all = TRUE)
# Keep only the document ID and the text column for the keyword analysis.
df <- select(storagebottles, sku_id, sku_name)
# akc automates co-occurrence network construction and community (cluster) detection
library(pacman)
p_load(dplyr, akc)
# Data cleaning with keyword_clean(). Per the original author's notes, it:
#   - removes parentheses and everything inside them (rmParentheses = TRUE),
#   - trims leading and trailing whitespace,
#   - drops all empty strings and pure-number strings,
#   - converts English text to lower case.
# NOTE(review): the printed results further down show whole product titles
# kept as single keywords and a graph with only 2 nodes, which suggests the
# titles are not being split into words here. Check the separator setting
# (`sep`) of keyword_clean() - TODO confirm.
clean_data <- df %>%
  keyword_clean(id = "sku_id", keyword = "sku_name")
# Merge keywords that share the same stem/lemma.
# Example: if "good boy" occurs 5 times and "good boys" 9 times, both have the
# lemma "good boy", so they are merged into the more frequent form, "good boys".
merge_data <- clean_data %>%
  keyword_merge()
merge_data
## # A tibble: 1,199 × 2
## id keyword
## <chr> <chr>
## 1 3256803118990386 "0.4l-1.7l stainless steel airtight coffee container storage canister se…
## 2 3256803826538697 "1-100pcs empty silver aluminum tins cans screw top round candle spice t…
## 3 3256804186933336 "1-30ml straight draw perfume refill tools set plastic diffuser syringe …
## 4 3256804187105399 "1-30ml straight draw perfume refill tools set plastic diffuser syringe …
## 5 3256803725515784 "1/10pcs plastic 70/86mm storage cap ribbed lids regular mouth screw cap…
## 6 3256804252879281 "1/2/3/4/5 pcs portable squeeze travel bottle facial bath bottle contain…
## 7 3256804202452859 "1/2/3/5 ml mini glass sample vials perfume bottle laboratory liquid fra…
## 8 3256804206267630 "1/2/3/5 ml mini glass sample vials perfume bottle laboratory liquid fra…
## 9 3256803534413593 "1/2/3/5 ml roll on bottle refillable empty glass essential oils perfume…
## 10 3256803873654482 "1/2/3pcs kitchen gadgets fresh herb keeper container clear spice fridge…
## # … with 1,189 more rows
# Automatic keyword grouping.
# See ?tidygraph::group_graph for other community detection algorithms.
grouped_data <- merge_data %>%
  keyword_group(
    # the default community detection algorithm
    com_detect_fun = group_fast_greedy,
    # only group the 200 most frequent keywords
    top = 200
  )
grouped_data
## # A tbl_graph: 2 nodes and 1 edges
## #
## # An unrooted tree
## #
## # Node Data: 2 × 3 (active)
## name freq group
## <chr> <int> <int>
## 1 2021new chili cans women' 1 1
## 2 s self-defense high concentration anti wolf spray portable self-defense spray … 1 2
## #
## # Edge Data: 1 × 3
## from to n
## <int> <int> <int>
## 1 1 2 1
# Convert to a data frame: `name` holds the keyword, `freq` its frequency,
# and `group` the cluster the keyword was assigned to.
grouped_data %>%
  as_tibble()
## # A tibble: 2 × 3
## name freq group
## <chr> <int> <int>
## 1 2021new chili cans women' 1 1
## 2 s self-defense high concentration anti wolf spray portable self-defense spray … 1 2
# Result output.
# Table output: show the 10 most frequent keywords of each cluster.
grouped_data %>%
  keyword_table(top = 10)
## # A tibble: 2 × 2
## Group `Keywords (TOP 10)`
## <int> <chr>
## 1 1 2021new chili cans women' (1)
## 2 2 s self-defense high concentration anti wolf spray portable self-defense spray cans (…
# Visual output of the grouped keywords.
grouped_data %>%
  keyword_vis()
2、基于主題模型的分類
p_load(tidytext, topicmodels)
# Use the example data shipped with akc (bibli_data_table). Lemmatisation is
# switched off in the cleaning step; keywords are then merged by stem.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
tidy_data <- keyword_clean(bibli_data_table, lemmatize = FALSE) %>%
  keyword_merge(reduce_form = "stem")
# Build the document-term matrix (DTM): count each keyword per document,
# then cast the counts with documents as rows and keywords as terms.
dtm_data <- tidy_data %>%
  count(id, keyword) %>%
  cast_dtm(id, keyword, n)
# LDA analysis: fit a two-topic model; the seed is passed through `control`.
lda_data <- LDA(
  x = dtm_data,
  k = 2,
  control = list(seed = 2022)
)
lda_data
## A LDA_VEM topic model with 2 topics.
# Inspect the probability of each keyword belonging to each topic (beta).
lda_topic <- lda_data %>%
  tidy(matrix = "beta")
lda_topic
## # A tibble: 6,200 × 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 austerity 0.000314
## 2 2 austerity 0.000430
## 3 1 community capacity 0.000195
## 4 2 community capacity 0.000177
## 5 1 library professionals 0.000612
## 6 2 library professionals 0.000505
## 7 1 public libraries 0.0181
## 8 2 public libraries 0.00948
## 9 1 public service delivery 0.000362
## 10 2 public service delivery 0.0000106
## # … with 6,190 more rows
# For each topic, keep the 5 keywords with the highest probability (beta).
# slice_max() replaces the superseded top_n(); like top_n(), it keeps ties by
# default, so a topic can return more than 5 rows if betas tie at the cutoff.
topic_terms <- lda_topic %>%
  group_by(topic) %>%
  slice_max(beta, n = 5) %>%
  ungroup() %>%
  # Sort by topic, then by descending beta within each topic.
  arrange(topic, desc(beta))
p_load(ggplot2)
# Show the top keywords as a bar chart, one panel per topic.
# reorder_within() together with scale_x_reordered() orders the bars by beta
# separately inside each facet.
topic_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  # FALSE is spelled out because F is an ordinary variable that can be reassigned.
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()
# Determine which topic each document belongs to (per-document gamma values).
lda_gamma <- lda_data %>%
  tidy(matrix = "gamma")
lda_gamma
## # A tibble: 1,942 × 3
## document topic gamma
## <chr> <int> <dbl>
## 1 1 1 0.516
## 2 2 1 0.496
## 3 3 1 0.490
## 4 4 1 0.471
## 5 5 1 0.495
## 6 6 1 0.508
## 7 7 1 0.497
## 8 8 1 0.498
## 9 9 1 0.508
## 10 10 1 0.503
## # … with 1,932 more rows
# Count of each keyword in each document, together with its assigned topic.
assignments <- lda_data %>%
  augment(data = dtm_data)
assignments
## # A tibble: 5,365 × 4
## document term count .topic
## <chr> <chr> <dbl> <dbl>
## 1 1 austerity 1 2
## 2 719 austerity 1 2
## 3 1 community capacity 1 1
## 4 1 library professionals 1 1
## 5 522 library professionals 1 1
## 6 863 library professionals 1 1
## 7 1 public libraries 1 1
## 8 2 public libraries 1 1
## 9 49 public libraries 1 1
## 10 51 public libraries 1 1
## # … with 5,355 more rows