A word tokenizer designed to keep compound words (such as "Covid-19" or "ice-cream") and URLs intact as single tokens.
Examples
txt <- "Was ice-cream better in Soviet Union or in New York? Bla bla, ble. The package https://quanteda.io/reference/index.html is very Nice! There are also compounded words like New_Jersey. The Covid-19 pandemic is very bad. Some,text,like;csv"
tokenize_by_words(txt)
#> [[1]]
#> [1] "was"
#> [2] "ice-cream"
#> [3] "better"
#> [4] "in"
#> [5] "soviet"
#> [6] "union"
#> [7] "or"
#> [8] "in"
#> [9] "new"
#> [10] "york"
#> [11] "bla"
#> [12] "bla"
#> [13] "ble"
#> [14] "the"
#> [15] "package"
#> [16] "https://quanteda.io/reference/index.html"
#> [17] "is"
#> [18] "very"
#> [19] "nice"
#> [20] "there"
#> [21] "are"
#> [22] "also"
#> [23] "compounded"
#> [24] "words"
#> [25] "like"
#> [26] "new_jersey"
#> [27] "the"
#> [28] "covid-19"
#> [29] "pandemic"
#> [30] "is"
#> [31] "very"
#> [32] "bad"
#> [33] "some"
#> [34] "text"
#> [35] "like"
#> [36] "csv"
#>
txt2 <- c("Was ice-cream better before?", "The Covid-19 brought many problems.")
tokenize_by_words(txt2)
#> [[1]]
#> [1] "was" "ice-cream" "better" "before"
#>
#> [[2]]
#> [1] "the" "covid-19" "brought" "many" "problems"
#>