R for data science ||使用stringr處理字符串

對于非結(jié)構(gòu)化和半結(jié)構(gòu)化的數(shù)據(jù)，正則表達(dá)式可以用非常簡練的語言來描述字符串中的表達(dá)模式蛮原。第一次見到正則表達(dá)式备徐，你可能會以為這是貓咪在鍵盤上踩出來的，但是隨著逐漸加深對他的理解后爆班，你就會體會其中的深刻含義了衷掷。

str_length(c("a", "R for data science", NA))
[1]  1 18 NA

字符串組合

str_c("x", "y")
#> [1] "xy"
str_c("x", "y", "z")
#> [1] "xyz"
str_c("x", "y", sep = ", ")
#> [1] "x, y"

x <- c("abc", NA)
str_c("|-", x, "-|")
#> [1] "|-abc-|" NA
str_c("|-", str_replace_na(x), "-|")
#> [1] "|-abc-|" "|-NA-|"

str_c("prefix-", c("a", "b", "c"), "-suffix")
#> [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"

name <- "Hadley"
time_of_day <- "morning"
birthday <- FALSE

str_c(
  "Good ", time_of_day, " ", name,
  if (birthday) " and HAPPY BIRTHDAY",
  "."
)
#> [1] "Good morning Hadley."

str_c(c("x", "y", "z"), collapse = ", ")
#> [1] "x, y, z"

提取子字符串

x <- c("Apple", "Banana", "Pear")
str_sub(x, 1, 3)
#> [1] "App" "Ban" "Pea"
# negative numbers count backwards from end
str_sub(x, -3, -1)
#> [1] "ple" "ana" "ear"

str_sub("a", 1, 5)
#> [1] "a"

str_sub(x, 1, 1) <- str_to_lower(str_sub(x, 1, 1))
x
#> [1] "apple"  "banana" "pear"

區(qū)域設(shè)置

# Turkish has two i's: with and without a dot, and it
# has a different rule for capitalising them:
str_to_upper(c("i", "?"))
#> [1] "I" "I"
str_to_upper(c("i", "?"), locale = "tr")
#> [1] "?" "I"

x <- c("apple", "eggplant", "banana")

str_sort(x, locale = "en")  # English
#> [1] "apple"    "banana"   "eggplant"

str_sort(x, locale = "haw") # Hawaiian
#> [1] "apple"    "eggplant" "banana"

正則表達(dá)式模式匹配

x <- c("apple", "banana", "pear")
str_view(x, "an")
banana

str_view(x, ".a.")
banana
pear

錨定

^ to match the start of the string.
$ to match the end of the string.

x <- c("apple", "banana", "pear")
str_view(x, "^a")
apple

str_view(x, "a$")

banana

字符串類和字符選項(xiàng)

\d: matches any digit.
\s: matches any whitespace (e.g. space, tab, newline).
[abc]: matches a, b, or c.

# Look for a literal character that normally has special meaning in a regex
str_view(c("abc", "a.c", "a*c", "a c"), "a[.]c")
a.c

str_view(c("grey", "gray"), "gr(e|a)y")
grey
gray

重復(fù)

?: 0 or 1
+: 1 or more
*: 0 or more

x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
str_view(x, "CC?")

1888 is the longest year in Roman numerals: MDCCCLXXXVIII

str_view(x, "CC+")

1888 is the longest year in Roman numerals: MDCCCLXXXVIII

str_view(x, 'C[LX]+')

1888 is the longest year in Roman numerals: MDCCCLXXXVIII

{n}: exactly n
{n,}: n or more
{,m}: at most m
{n,m}: between n and m
str_view(x, "C{2}")

分組與回溯引用

str_view(fruit, "(..)\\1", match = TRUE)

banana
coconut
cucumber
jujube
papaya
salal berry

匹配檢測

x <- c("apple", "banana", "pear")
str_detect(x, "e")
#> [1]  TRUE FALSE  TRUE

# How many common words start with t?
sum(str_detect(words, "^t"))
#> [1] 65
# What proportion of common words end with a vowel?
mean(str_detect(words, "[aeiou]$"))
#> [1] 0.277

# Find all words containing at least one vowel, and negate
no_vowels_1 <- !str_detect(words, "[aeiou]")
# Find all words consisting only of consonants (non-vowels)
no_vowels_2 <- str_detect(words, "^[^aeiou]+$")
identical(no_vowels_1, no_vowels_2)
#> [1] TRUE

words[str_detect(words, "x$")]
#> [1] "box" "sex" "six" "tax"
str_subset(words, "x$")
#> [1] "box" "sex" "six" "tax"

df <- tibble(
  word = words, 
  i = seq_along(word)
)
df %>% 
  filter(str_detect(word, "x$"))
#> # A tibble: 4 x 2
#>   word      i
#>   <chr> <int>
#> 1 box     108
#> 2 sex     747
#> 3 six     772
#> 4 tax     841

x <- c("apple", "banana", "pear")
str_count(x, "a")
#> [1] 1 3 1

# On average, how many vowels per word?
mean(str_count(words, "[aeiou]"))
#> [1] 1.99

df %>% 
  mutate(
    vowels = str_count(word, "[aeiou]"),
    consonants = str_count(word, "[^aeiou]")
  )
#> # A tibble: 980 x 4
#>   word         i vowels consonants
#>   <chr>    <int>  <int>      <int>
#> 1 a            1      1          0
#> 2 able         2      2          2
#> 3 about        3      3          2
#> 4 absolute     4      4          4
#> 5 accept       5      2          4
#> 6 account      6      3          4
#> # … with 974 more rows

str_count("abababa", "aba")
#> [1] 2
str_view_all("abababa", "aba")

aba b aba

提取匹配內(nèi)容

length(sentences)
#> [1] 720
head(sentences)
#> [1] "The birch canoe slid on the smooth planks." 
#> [2] "Glue the sheet to the dark blue background."
#> [3] "It's easy to tell the depth of a well."     
#> [4] "These days a chicken leg is a rare dish."   
#> [5] "Rice is often served in round bowls."       
#> [6] "The juice of lemons makes fine punch."

colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- str_c(colours, collapse = "|")
colour_match
#> [1] "red|orange|yellow|green|blue|purple"

has_colour <- str_subset(sentences, colour_match)
matches <- str_extract(has_colour, colour_match)
head(matches)
#> [1] "blue" "blue" "red"  "red"  "red"  "blue"

more <- sentences[str_count(sentences, colour_match) > 1]
str_view_all(more, colour_match)

It is hard to erase blue or red ink.
Thegreen light in the brown box flickered.
The sky in the west is tinged with orange red.

str_extract(more, colour_match)
#> [1] "blue"   "green"  "orange"

分組匹配

noun <- "(a|the) ([^ ]+)"

has_noun <- sentences %>%
  str_subset(noun) %>%
  head(10)
has_noun %>% 
  str_extract(noun)
#>  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
#>  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"

tibble(sentence = sentences) %>% 
  tidyr::extract(
    sentence, c("article", "noun"), "(a|the) ([^ ]+)", 
    remove = FALSE
  )
#> # A tibble: 720 x 3
#>   sentence                                    article noun   
#>   <chr>                                       <chr>   <chr>  
#> 1 The birch canoe slid on the smooth planks.  the     smooth 
#> 2 Glue the sheet to the dark blue background. the     sheet  
#> 3 It's easy to tell the depth of a well.      the     depth  
#> 4 These days a chicken leg is a rare dish.    a       chicken
#> 5 Rice is often served in round bowls.        <NA>    <NA>   
#> 6 The juice of lemons makes fine punch.       <NA>    <NA>   
#> # … with 714 more rows

替換匹配內(nèi)容

x <- c("apple", "pear", "banana")
str_replace(x, "[aeiou]", "-")
#> [1] "-pple"  "p-ar"   "b-nana"
str_replace_all(x, "[aeiou]", "-")
#> [1] "-ppl-"  "p--r"   "b-n-n-"

x <- c("1 house", "2 cars", "3 people")
str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))
#> [1] "one house"    "two cars"     "three people"

sentences %>% 
  str_replace("([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2") %>% 
  head(5)
#> [1] "The canoe birch slid on the smooth planks." 
#> [2] "Glue sheet the to the dark blue background."
#> [3] "It's to easy tell the depth of a well."     
#> [4] "These a days chicken leg is a rare dish."   
#> [5] "Rice often is served in round bowls."

拆分

sentences %>%
  head(5) %>% 
  str_split(" ")
#> [[1]]
#> [1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth" 
#> [8] "planks."
#> 
#> [[2]]
#> [1] "Glue"        "the"         "sheet"       "to"          "the"        
#> [6] "dark"        "blue"        "background."
#> 
#> [[3]]
#> [1] "It's"  "easy"  "to"    "tell"  "the"   "depth" "of"    "a"     "well."
#> 
#> [[4]]
#> [1] "These"   "days"    "a"       "chicken" "leg"     "is"      "a"      
#> [8] "rare"    "dish."  
#> 
#> [[5]]
#> [1] "Rice"   "is"     "often"  "served" "in"     "round"  "bowls."

"a|b|c|d" %>% 
  str_split("\\|") %>% 
  .[[1]]
#> [1] "a" "b" "c" "d"

sentences %>%
  head(5) %>% 
  str_split(" ", simplify = TRUE)
#>      [,1]    [,2]    [,3]    [,4]      [,5]  [,6]    [,7]    
#> [1,] "The"   "birch" "canoe" "slid"    "on"  "the"   "smooth"
#> [2,] "Glue"  "the"   "sheet" "to"      "the" "dark"  "blue"  
#> [3,] "It's"  "easy"  "to"    "tell"    "the" "depth" "of"    
#> [4,] "These" "days"  "a"     "chicken" "leg" "is"    "a"     
#> [5,] "Rice"  "is"    "often" "served"  "in"  "round" "bowls."
#>      [,8]          [,9]   
#> [1,] "planks."     ""     
#> [2,] "background." ""     
#> [3,] "a"           "well."
#> [4,] "rare"        "dish."
#> [5,] ""            ""

r4ds

?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者

人面猴
序言：七十年代末，一起剝皮案震驚了整個濱河市柿菩，隨后出現(xiàn)的幾起案子戚嗅，更是在濱河造成了極大的恐慌，老刑警劉巖枢舶，帶你破解...
沈念sama閱讀 218,284評論 6贊 506
死咒
序言：濱河連續(xù)發(fā)生了三起死亡事件懦胞，死亡現(xiàn)場離奇詭異，居然都是意外死亡凉泄，警方通過查閱死者的電腦和手機(jī)躏尉，發(fā)現(xiàn)死者居然都...
沈念sama閱讀 93,115評論 3贊 395
救了他兩次的神仙讓他今天三更去死
文/潘曉璐我一進(jìn)店門，熙熙樓的掌柜王于貴愁眉苦臉地迎上來后众，“玉大人胀糜，你說我怎么就攤上這事颅拦。” “怎么了教藻？”我有些...
開封第一講書人閱讀 164,614評論 0贊 354
道士緝兇錄：失蹤的賣姜人
文/不壞的土叔我叫張陵距帅，是天一觀的道長。經(jīng)常有香客問我括堤，道長碌秸，這世上最難降的妖魔是什么？我笑而不...
開封第一講書人閱讀 58,671評論 1贊 293
?港島之戀（遺憾婚禮）
正文為了忘掉前任悄窃，我火速辦了婚禮哮肚，結(jié)果婚禮上，老公的妹妹穿的比我還像新娘广匙。我一直安慰自己，他們只是感情好恼策，可當(dāng)我...
茶點(diǎn)故事閱讀 67,699評論 6贊 392
惡毒庶女頂嫁案：這布局不是一般人想出來的
文/花漫我一把揭開白布鸦致。她就那樣靜靜地躺著，像睡著了一般涣楷。火紅的嫁衣襯著肌膚如雪分唾。梳的紋絲不亂的頭發(fā)上，一...
開封第一講書人閱讀 51,562評論 1贊 305
城市分裂傳說
那天狮斗，我揣著相機(jī)與錄音绽乔，去河邊找鬼。笑死碳褒，一個胖子當(dāng)著我的面吹牛折砸，可吹牛的內(nèi)容都是我干的。我是一名探鬼主播沙峻，決...
沈念sama閱讀 40,309評論 3贊 418
雙鴛鴦連環(huán)套：你想象不到人心有多黑
文/蒼蘭香墨我猛地睜開眼睦授，長吁一口氣：“原來是場噩夢啊……” “哼！你這毒婦竟也來了摔寨？” 一聲冷哼從身側(cè)響起去枷，我...
開封第一講書人閱讀 39,223評論 0贊 276
萬榮殺人案實(shí)錄
序言：老撾萬榮一對情侶失蹤，失蹤者是張志新（化名）和其女友劉穎是复，沒想到半個月后删顶，有當(dāng)?shù)厝嗽跇淞掷锇l(fā)現(xiàn)了一具尸體，經(jīng)...
沈念sama閱讀 45,668評論 1贊 314
?護(hù)林員之死
正文獨(dú)居荒郊野嶺守林人離奇死亡淑廊，尸身上長有42處帶血的膿包…… 初始之章·張勛以下內(nèi)容為張勛視角年9月15日...
茶點(diǎn)故事閱讀 37,859評論 3贊 336
?白月光啟示錄
正文我和宋清朗相戀三年逗余，在試婚紗的時候發(fā)現(xiàn)自己被綠了。大學(xué)時的朋友給我發(fā)了我未婚夫和他白月光在一起吃飯的照片季惩。...
茶點(diǎn)故事閱讀 39,981評論 1贊 348
活死人
序言：一個原本活蹦亂跳的男人離奇死亡猎荠，死狀恐怖坚弱，靈堂內(nèi)的尸體忽然破棺而出，到底是詐尸還是另有隱情关摇，我是刑警寧澤荒叶，帶...
沈念sama閱讀 35,705評論 5贊 347
?日本核電站爆炸內(nèi)幕
正文年R本政府宣布，位于F島的核電站输虱，受9級特大地震影響些楣，放射性物質(zhì)發(fā)生泄漏。R本人自食惡果不足惜宪睹，卻給世界環(huán)境...
茶點(diǎn)故事閱讀 41,310評論 3贊 330
男人毒藥：我在死后第九天來索命
文/蒙蒙一愁茁、第九天我趴在偏房一處隱蔽的房頂上張望。院中可真熱鬧亭病，春花似錦鹅很、人聲如沸。這莊子的主人今日做“春日...
開封第一講書人閱讀 31,904評論 0贊 22
一樁弒父案促煮，背后竟有這般陰謀
文/蒼蘭香墨我抬頭看了看天上的太陽。三九已至整袁，卻和暖如春菠齿，著一層夾襖步出監(jiān)牢的瞬間，已是汗流浹背坐昙。一陣腳步聲響...
開封第一講書人閱讀 33,023評論 1贊 270
情欲美人皮
我被黑心中介騙來泰國打工绳匀，沒想到剛下飛機(jī)就差點(diǎn)兒被人妖公主榨干…… 1. 我叫王不留，地道東北人炸客。一個月前我還...
沈念sama閱讀 48,146評論 3贊 370
代替公主和親
正文我出身青樓疾棵，卻偏偏與公主長得像，于是被迫代替她去往敵國和親痹仙。傳聞我的和親對象是個殘疾皇子陋桂，可洞房花燭夜當(dāng)晚...
茶點(diǎn)故事閱讀 44,933評論 2贊 355