Skip to contents

A word tokenizer designed to keep compound words (such as "ice-cream", "Covid-19" or "New_Jersey") and URLs intact as single tokens.

Usage

tokenize_by_words(txt, lower = TRUE, unlist = FALSE)

Arguments

txt

Input text: a character vector. Each element is tokenized separately.

lower

If TRUE, convert words to lowercase. Default TRUE.

unlist

If TRUE, return the tokens as an unlisted vector instead of a list with one element per input text. Default FALSE.

Examples

txt <- "Was ice-cream better in Soviet Union or in New York? Bla bla, ble. The package https://quanteda.io/reference/index.html is very Nice! There are also compounded words like New_Jersey. The Covid-19 pandemic is very bad. Some,text,like;csv"
tokenize_by_words(txt)
#> [[1]]
#>  [1] "was"                                     
#>  [2] "ice-cream"                               
#>  [3] "better"                                  
#>  [4] "in"                                      
#>  [5] "soviet"                                  
#>  [6] "union"                                   
#>  [7] "or"                                      
#>  [8] "in"                                      
#>  [9] "new"                                     
#> [10] "york"                                    
#> [11] "bla"                                     
#> [12] "bla"                                     
#> [13] "ble"                                     
#> [14] "the"                                     
#> [15] "package"                                 
#> [16] "https://quanteda.io/reference/index.html"
#> [17] "is"                                      
#> [18] "very"                                    
#> [19] "nice"                                    
#> [20] "there"                                   
#> [21] "are"                                     
#> [22] "also"                                    
#> [23] "compounded"                              
#> [24] "words"                                   
#> [25] "like"                                    
#> [26] "new_jersey"                              
#> [27] "the"                                     
#> [28] "covid-19"                                
#> [29] "pandemic"                                
#> [30] "is"                                      
#> [31] "very"                                    
#> [32] "bad"                                     
#> [33] "some"                                    
#> [34] "text"                                    
#> [35] "like"                                    
#> [36] "csv"                                     
#> 

txt2 <- c("Was ice-cream better before?", "The Covid-19 brought many problems.")
tokenize_by_words(txt2)
#> [[1]]
#> [1] "was"       "ice-cream" "better"    "before"   
#> 
#> [[2]]
#> [1] "the"      "covid-19" "brought"  "many"     "problems"
#>