vignettes/stopwords.Rmd
stopwords.Rmd
This function is still under development. At this time you can generate stopword lists in Portuguese and English.
Unlike most stopword lists, the words here are categorized, allowing more fine-grained control of the words by grammatical category.
Choose the language and which grammar tags to use to generate a stopword
list. The `lang` parameter takes the two-letter language code (e.g. `"pt"`
or `"en"`), and the `categories` parameter designates the grammar categories,
following the Penn Treebank tag set. See the table below:
POS Tag | Description | Example | Notes |
---|---|---|---|
CC | coordinating conjunction | and | |
CD | cardinal number | one, third | |
DT | determiner | the | |
EX | existential there | there is | |
PP | pronoun | me, you, he, she, it, we, they | |
PPZ | possessive pronoun | your, my, mine, ours, his, her | |
UH | interjection | oops! | |
V | verb | are, be, have | variations of the verbs to be, to have, etc. |
gen_stopwords()
#> [1] "e" "ou" "mas" "que"
#> [5] "porque" "por que" "porquê" "por quê"
#> [9] "se" "como" "primeiro" "segundo"
#> [13] "terceiro" "quarto" "quinto" "sexto"
#> [17] "sétimo" "oitavo" "nono" "décimo"
#> [21] "vigésimo" "trigésimo" "quadrigésimo" "quinquagésimo"
#> [25] "sexagésimo" "septuagésimo" "setuagésimo" "octogésimo"
#> [29] "nonagésimo" "centésimo" "ducentésimo" "trecentésimo"
#> [33] "quadringentésimo" "quingentésimo" "seiscentésimo" "sexcentésimo"
#> [37] "septingentésimo" "setingentésimo" "octingentésimo" "nongentésimo"
#> [41] "milésimo" "milionésimo" "bilionésimo"
gen_stopwords(lang = "en", categories = "CC")
#> [1] "and" "but" "or" "for" "yet" "so"
gen_stopwords(lang = "pt", categories = "CC DT")
#> [1] "e" "ou" "mas" "que" "porque" "por que" "porquê"
#> [8] "por quê" "se" "como" "o" "a" "os" "as"
# several categories can be combined in one call, separated by spaces
gen_stopwords(lang = "en", categories = "CC DT V")
#> [1] "and" "but" "or" "for" "yet" "so" "a" "the" "this"
#> [10] "am" "are" "is" "be" "can" "could" "did" "do" "have"
#> [19] "he" "it" "may" "might" "must" "need" "no" "not" "now"
#> [28] "of" "on" "she" "that" "to" "was" "were"
gen_stopwords(lang = "pt")
#> [1] "e" "ou" "mas" "que"
#> [5] "porque" "por que" "porquê" "por quê"
#> [9] "se" "como" "primeiro" "segundo"
#> [13] "terceiro" "quarto" "quinto" "sexto"
#> [17] "sétimo" "oitavo" "nono" "décimo"
#> [21] "vigésimo" "trigésimo" "quadrigésimo" "quinquagésimo"
#> [25] "sexagésimo" "septuagésimo" "setuagésimo" "octogésimo"
#> [29] "nonagésimo" "centésimo" "ducentésimo" "trecentésimo"
#> [33] "quadringentésimo" "quingentésimo" "seiscentésimo" "sexcentésimo"
#> [37] "septingentésimo" "setingentésimo" "octingentésimo" "nongentésimo"
#> [41] "milésimo" "milionésimo" "bilionésimo"
gen_stopwords(lang = "pt", categories = "V")
#> [1] "ser" "sou" "sois" "é" "és"
#> [6] "somos" "são" "era" "eram" "éramos"
#> [11] "serei" "será" "serão" "serás" "fui"
#> [16] "foste" "foi" "fomos" "fostes" "foram"
#> [21] "eras" "éreis" "seremos" "sereis" "seja"
#> [26] "sejam" "estar" "estou" "estás" "está"
#> [31] "estamos" "estais" "estão" "estive" "estiveste"
#> [36] "esteve" "estivemos" "estivestes" "estiveram" "estava"
#> [41] "estavas" "estávamos" "estáveis" "estavam" "estarei"
#> [46] "estarás" "estará" "estaremos" "estareis" "estarão"
#> [51] "esteja" "estejam" "ter" "tenham" "têem"
#> [56] "tenho" "tens" "tem" "temos" "tendes"
#> [61] "têm" "tive" "tiveste" "teve" "tivemos"
#> [66] "tivestes" "tiveram" "tinha" "tinhas" "tínhamos"
#> [71] "tínheis" "tinham" "terei" "terás" "terá"
#> [76] "teremos" "tereis" "terão" "teria" "teriam"
#> [81] "haver" "houve" "haveria" "haveriam" "hei"
#> [86] "hás" "há" "havemos" "haveis" "hão"
#> [91] "houver" "houveres" "houvermos" "houverdes" "houverem"
#> [96] "havia" "havias" "havíamos" "havíeis" "haviam"
#> [101] "haverei" "haverás" "haverá" "haveremos" "havereis"
#> [106] "haverão" "haja" "hajam" "houvera" "houveram"
#> [111] "houvesse"
With the `vec` parameter, it is possible to choose among three different
output formats: list (`"list"`), vector (`"vec"`, the default) and named
vector (`"n_vec"`).
gen_stopwords(lang = "pt", categories = "V", vec = "list")
#> $V
#> $V$ser
#> [1] "ser" "sou" "sois" "é" "és" "somos" "sois"
#> [8] "são" "era" "eram" "éramos" "serei" "será" "serão"
#> [15] "serás" "fui" "foste" "foi" "fomos" "fostes" "foram"
#> [22] "era" "eras" "era" "éramos" "éreis" "eram" "serei"
#> [29] "serás" "será" "seremos" "sereis" "serão" "seja" "sejam"
#>
#> $V$estar
#> [1] "estar" "estou" "estás" "está" "estamos"
#> [6] "estais" "estão" "estive" "estiveste" "esteve"
#> [11] "estivemos" "estivestes" "estiveram" "estava" "estavas"
#> [16] "estava" "estávamos" "estáveis" "estavam" "estarei"
#> [21] "estarás" "estará" "estaremos" "estareis" "estarão"
#> [26] "esteja" "estejam"
#>
#> $V$ter
#> [1] "ter" "tenham" "têem" "tenho" "tens" "tem"
#> [7] "temos" "tendes" "têm" "tive" "tiveste" "teve"
#> [13] "tivemos" "tivestes" "tiveram" "tinha" "tinhas" "tinha"
#> [19] "tínhamos" "tínheis" "tinham" "terei" "terás" "terá"
#> [25] "teremos" "tereis" "terão" "teria" "teriam"
#>
#> $V$haver
#> [1] "haver" "houve" "haveria" "haveriam" "hei" "hás"
#> [7] "há" "havemos" "haveis" "hão" "houver" "houveres"
#> [13] "houver" "houvermos" "houverdes" "houverem" "havia" "havias"
#> [19] "havia" "havíamos" "havíeis" "haviam" "haverei" "haverás"
#> [25] "haverá" "haveremos" "havereis" "haverão" "haja" "hajam"
#> [31] "houvera" "houveram" "houvesse"
#>
#>
#> $included
#> character(0)
gen_stopwords(lang = "pt", categories = "V", vec = "vec")
#> [1] "ser" "sou" "sois" "é" "és"
#> [6] "somos" "são" "era" "eram" "éramos"
#> [11] "serei" "será" "serão" "serás" "fui"
#> [16] "foste" "foi" "fomos" "fostes" "foram"
#> [21] "eras" "éreis" "seremos" "sereis" "seja"
#> [26] "sejam" "estar" "estou" "estás" "está"
#> [31] "estamos" "estais" "estão" "estive" "estiveste"
#> [36] "esteve" "estivemos" "estivestes" "estiveram" "estava"
#> [41] "estavas" "estávamos" "estáveis" "estavam" "estarei"
#> [46] "estarás" "estará" "estaremos" "estareis" "estarão"
#> [51] "esteja" "estejam" "ter" "tenham" "têem"
#> [56] "tenho" "tens" "tem" "temos" "tendes"
#> [61] "têm" "tive" "tiveste" "teve" "tivemos"
#> [66] "tivestes" "tiveram" "tinha" "tinhas" "tínhamos"
#> [71] "tínheis" "tinham" "terei" "terás" "terá"
#> [76] "teremos" "tereis" "terão" "teria" "teriam"
#> [81] "haver" "houve" "haveria" "haveriam" "hei"
#> [86] "hás" "há" "havemos" "haveis" "hão"
#> [91] "houver" "houveres" "houvermos" "houverdes" "houverem"
#> [96] "havia" "havias" "havíamos" "havíeis" "haviam"
#> [101] "haverei" "haverás" "haverá" "haveremos" "havereis"
#> [106] "haverão" "haja" "hajam" "houvera" "houveram"
#> [111] "houvesse"
gen_stopwords(lang = "pt", categories = "V", vec = "n_vec")
#> V.ser1 V.ser2 V.ser3 V.ser4 V.ser5 V.ser6
#> "ser" "sou" "sois" "é" "és" "somos"
#> V.ser7 V.ser8 V.ser9 V.ser10 V.ser11 V.ser12
#> "sois" "são" "era" "eram" "éramos" "serei"
#> V.ser13 V.ser14 V.ser15 V.ser16 V.ser17 V.ser18
#> "será" "serão" "serás" "fui" "foste" "foi"
#> V.ser19 V.ser20 V.ser21 V.ser22 V.ser23 V.ser24
#> "fomos" "fostes" "foram" "era" "eras" "era"
#> V.ser25 V.ser26 V.ser27 V.ser28 V.ser29 V.ser30
#> "éramos" "éreis" "eram" "serei" "serás" "será"
#> V.ser31 V.ser32 V.ser33 V.ser34 V.ser35 V.estar1
#> "seremos" "sereis" "serão" "seja" "sejam" "estar"
#> V.estar2 V.estar3 V.estar4 V.estar5 V.estar6 V.estar7
#> "estou" "estás" "está" "estamos" "estais" "estão"
#> V.estar8 V.estar9 V.estar10 V.estar11 V.estar12 V.estar13
#> "estive" "estiveste" "esteve" "estivemos" "estivestes" "estiveram"
#> V.estar14 V.estar15 V.estar16 V.estar17 V.estar18 V.estar19
#> "estava" "estavas" "estava" "estávamos" "estáveis" "estavam"
#> V.estar20 V.estar21 V.estar22 V.estar23 V.estar24 V.estar25
#> "estarei" "estarás" "estará" "estaremos" "estareis" "estarão"
#> V.estar26 V.estar27 V.ter1 V.ter2 V.ter3 V.ter4
#> "esteja" "estejam" "ter" "tenham" "têem" "tenho"
#> V.ter5 V.ter6 V.ter7 V.ter8 V.ter9 V.ter10
#> "tens" "tem" "temos" "tendes" "têm" "tive"
#> V.ter11 V.ter12 V.ter13 V.ter14 V.ter15 V.ter16
#> "tiveste" "teve" "tivemos" "tivestes" "tiveram" "tinha"
#> V.ter17 V.ter18 V.ter19 V.ter20 V.ter21 V.ter22
#> "tinhas" "tinha" "tínhamos" "tínheis" "tinham" "terei"
#> V.ter23 V.ter24 V.ter25 V.ter26 V.ter27 V.ter28
#> "terás" "terá" "teremos" "tereis" "terão" "teria"
#> V.ter29 V.haver1 V.haver2 V.haver3 V.haver4 V.haver5
#> "teriam" "haver" "houve" "haveria" "haveriam" "hei"
#> V.haver6 V.haver7 V.haver8 V.haver9 V.haver10 V.haver11
#> "hás" "há" "havemos" "haveis" "hão" "houver"
#> V.haver12 V.haver13 V.haver14 V.haver15 V.haver16 V.haver17
#> "houveres" "houver" "houvermos" "houverdes" "houverem" "havia"
#> V.haver18 V.haver19 V.haver20 V.haver21 V.haver22 V.haver23
#> "havias" "havia" "havíamos" "havíeis" "haviam" "haverei"
#> V.haver24 V.haver25 V.haver26 V.haver27 V.haver28 V.haver29
#> "haverás" "haverá" "haveremos" "havereis" "haverão" "haja"
#> V.haver30 V.haver31 V.haver32 V.haver33
#> "hajam" "houvera" "houveram" "houvesse"
To use only certain kinds of verbs, such as only the
variations/conjugations of the Portuguese verb `ser`:
my_sw <- gen_stopwords("pt", "V", vec = "list")
my_sw$V$ser
#> [1] "ser" "sou" "sois" "é" "és" "somos" "sois"
#> [8] "são" "era" "eram" "éramos" "serei" "será" "serão"
#> [15] "serás" "fui" "foste" "foi" "fomos" "fostes" "foram"
#> [22] "era" "eras" "era" "éramos" "éreis" "eram" "serei"
#> [29] "serás" "será" "seremos" "sereis" "serão" "seja" "sejam"
# or shorter
gen_stopwords("pt", "V", vec = "list")$V$ser
#> [1] "ser" "sou" "sois" "é" "és" "somos" "sois"
#> [8] "são" "era" "eram" "éramos" "serei" "será" "serão"
#> [15] "serás" "fui" "foste" "foi" "fomos" "fostes" "foram"
#> [22] "era" "eras" "era" "éramos" "éreis" "eram" "serei"
#> [29] "serás" "será" "seremos" "sereis" "serão" "seja" "sejam"
To see all the categories and their respective terms, run the following code:
show_sw("en")
#> $IN
#> [1] "of" "for" "in" "by"
#>
#> $DT
#> [1] "a" "the" "this"
#>
#> $CC
#> [1] "and" "but" "or" "for" "but" "yet" "so"
#>
#> $CD
#> [1] "zero" "one" "two" "three" "four" "five" "first"
#> [8] "second" "third" "fourth" "fifth" "sixth" "seventh" "eighth"
#> [15] "ninth" "tenth"
#>
#> $JJ
#> [1] "blue" "happy" "sad"
#>
#> $JJR
#> [1] "bluer" "happier"
#>
#> $JJS
#> [1] "bluest" "happiest"
#>
#> $MD
#> [1] "could" "will"
#>
#> $PP
#> [1] "I" "you" "he" "she" "it" "we"
#>
#> $PRP
#> [1] "I" "you" "he" "she" "we"
#>
#> $PPZ
#> [1] "your" "my" "mine" "ours" "his" "her"
#>
#> $RB
#> [1] "however" "usually" "naturally" "here" "good"
#>
#> $RBR
#> [1] "better"
#>
#> $UH
#> [1] "aha"
#>
#> $RP
#> [1] "up" "off"
#>
#> $V
#> [1] "am" "are" "is" "be" "can" "could" "did" "do" "have"
#> [10] "he" "is" "it" "may" "might" "must" "need" "no" "not"
#> [19] "now" "of" "on" "or" "she" "that" "the" "to" "was"
#> [28] "were"
#>
#> $question
#> [1] "what" "when" "where" "who" "whom" "why" "because"
#>
#> $PT
#> [1] "Mr." "Mrs." "Miss" "Ms." "Sir" "Madam" "Dr." "Prof."
show_sw("en", as_vector = TRUE)
#> [1] "a" "aha" "am" "and" "are" "be"
#> [7] "because" "better" "blue" "bluer" "bluest" "but"
#> [13] "by" "can" "could" "did" "do" "Dr."
#> [19] "eighth" "fifth" "first" "five" "for" "four"
#> [25] "fourth" "good" "happier" "happiest" "happy" "have"
#> [31] "he" "her" "here" "his" "however" "I"
#> [37] "in" "is" "it" "Madam" "may" "might"
#> [43] "mine" "Miss" "Mr." "Mrs." "Ms." "must"
#> [49] "my" "naturally" "need" "ninth" "no" "not"
#> [55] "now" "of" "off" "on" "one" "or"
#> [61] "ours" "Prof." "sad" "second" "seventh" "she"
#> [67] "Sir" "sixth" "so" "tenth" "that" "the"
#> [73] "third" "this" "three" "to" "two" "up"
#> [79] "usually" "was" "we" "were" "what" "when"
#> [85] "where" "who" "whom" "why" "will" "yet"
#> [91] "you" "your" "zero"
show_sw("pt", as_vector = TRUE)
#> [1] "a" "ah" "ai"
#> [4] "aquele" "as" "assim"
#> [7] "bilionésimo" "bosta" "caralho"
#> [10] "caramba" "centésimo" "cocô"
#> [13] "com" "como" "concluindo"
#> [16] "Conclusão" "conosco" "consectário"
#> [19] "consequentemente" "consigo" "convosco"
#> [22] "credo" "da" "das"
#> [25] "de" "décimo" "decorre"
#> [28] "dela" "delas" "dele"
#> [31] "deles" "depreende-se" "desse modo"
#> [34] "do" "dos" "Doutor"
#> [37] "Doutora" "Doutores" "Dr."
#> [40] "Dra." "ducentésimo" "e"
#> [43] "é" "eh" "ei"
#> [46] "ela" "elas" "ele"
#> [49] "eles" "em" "em suma"
#> [52] "em vista disso" "enfim" "então"
#> [55] "era" "eram" "éramos"
#> [58] "eras" "éreis" "és"
#> [61] "esse" "está" "estais"
#> [64] "estamos" "estão" "estar"
#> [67] "estará" "estarão" "estarás"
#> [70] "estarei" "estareis" "estaremos"
#> [73] "estás" "estava" "estavam"
#> [76] "estávamos" "estavas" "estáveis"
#> [79] "este" "esteja" "estejam"
#> [82] "esteve" "estive" "estivemos"
#> [85] "estiveram" "estiveste" "estivestes"
#> [88] "estou" "eu" "Excelentíssima"
#> [91] "Excelentíssimo" "Excelentíssimos" "Exmo."
#> [94] "Exmos." "finalmente" "fiu"
#> [97] "foi" "fomos" "foram"
#> [100] "foste" "fostes" "fui"
#> [103] "há" "haja" "hajam"
#> [106] "hão" "hás" "haveis"
#> [109] "havemos" "haver" "haverá"
#> [112] "haverão" "haverás" "haverei"
#> [115] "havereis" "haveremos" "haveria"
#> [118] "haveriam" "havia" "haviam"
#> [121] "havíamos" "havias" "havíeis"
#> [124] "hei" "houve" "houver"
#> [127] "houvera" "houveram" "houverdes"
#> [130] "houverem" "houveres" "houvermos"
#> [133] "houvesse" "isso" "isto"
#> [136] "lhe" "logo" "mas"
#> [139] "me" "meo" "merda"
#> [142] "meu" "milésimo" "milionésimo"
#> [145] "mim" "na" "né"
#> [148] "neh" "no" "nonagésimo"
#> [151] "nongentésimo" "nono" "nos"
#> [154] "nós" "nossa" "nossas"
#> [157] "nosso" "nossos" "o"
#> [160] "octingentésimo" "octogésimo" "oh"
#> [163] "oitavo" "opa" "os"
#> [166] "ou" "ou seja" "para"
#> [169] "pois" "por" "por isso"
#> [172] "por que" "por quê" "porque"
#> [175] "porquê" "porra" "portanto"
#> [178] "pqp" "primeiro" "psiu"
#> [181] "quadrigésimo" "quadringentésimo" "quarto"
#> [184] "que" "quingentésimo" "quinquagésimo"
#> [187] "quinto" "Rev.ma" "são"
#> [190] "se" "segue-se" "segundo"
#> [193] "seiscentésimo" "seja" "sejam"
#> [196] "sem" "Senhor" "Senhoras"
#> [199] "Senhores" "septingentésimo" "septuagésimo"
#> [202] "ser" "será" "serão"
#> [205] "serás" "serei" "sereis"
#> [208] "seremos" "sétimo" "setingentésimo"
#> [211] "setuagésimo" "seu" "sexagésimo"
#> [214] "sexcentésimo" "sexto" "si"
#> [217] "sois" "somos" "sou"
#> [220] "Sr." "Sra." "sua"
#> [223] "te" "têem" "tem"
#> [226] "têm" "temos" "tendes"
#> [229] "tenham" "tenho" "tens"
#> [232] "ter" "terá" "terão"
#> [235] "terás" "terceiro" "terei"
#> [238] "tereis" "teremos" "teria"
#> [241] "teriam" "teu" "teve"
#> [244] "ti" "tinha" "tinham"
#> [247] "tínhamos" "tinhas" "tínheis"
#> [250] "tive" "tivemos" "tiveram"
#> [253] "tiveste" "tivestes" "trecentésimo"
#> [256] "trigésimo" "tu" "uhu"
#> [259] "ui" "uou" "V.A."
#> [262] "V.Ex." "V.Ex.a" "V.Ex.as"
#> [265] "V.Mag" "V.S." "V.S.a"
#> [268] "vigésimo" "você" "vocês"
#> [271] "vos" "vós" "vtnc"
#> [274] "VV.AA." "wow"