Create newsletter-parse.R
This commit is contained in:
parent
c2a8d4387c
commit
0d39e4d19a
39
newsletter-parse.R
Normal file
39
newsletter-parse.R
Normal file
|
@ -0,0 +1,39 @@
|
|||
|
||||
# Script to parse email newsletters from environmental groups
|
||||
|
||||
# Note: There doesn't seem to be a good package for ingesting email mbox files in R,
|
||||
# so I have already processed an exported mbox file using python (https://github.com/jarrodparkes/mbox-to-csv)
|
||||
|
||||
# PRELIMINARIES, load libraries and data
|
||||
|
||||
require(tm)
|
||||
require(wordcloud)
|
||||
require(stringr)
|
||||
|
||||
# Start by generating wordcloud from CCF applications stored in CSV
|
||||
|
||||
# Clean data set
|
||||
|
||||
newsletters <- Corpus (DirSource("CCF/"))
|
||||
ccf <- tm_map(ccf, stripWhitespace)
|
||||
ccf <- tm_map(ccf, tolower)
|
||||
ccf <- tm_map(ccf, removeNumbers)
|
||||
ccf <- tm_map(ccf, removePunctuation)
|
||||
|
||||
# tweak by adding extra stopwords, "will"
|
||||
|
||||
myStopwords <- c(stopwords('english'), "will", "project")
|
||||
|
||||
# tweak by removing extra stopwords, in this example "r"
|
||||
# idx <- which(myStopwords == "r")
|
||||
# myStopwords <- myStopwords[-idx]
|
||||
|
||||
ccf <- tm_map(ccf, removeWords, myStopwords)
|
||||
# ccf <- tm_map(ccf, stemDocument)
|
||||
ccf <- tm_map(ccf, PlainTextDocument)
|
||||
|
||||
# Create word cloud
|
||||
|
||||
# wordcloud(ccf, scale=c(5,0.5), max.words=100, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, "Dark2"))
|
||||
# wordcloud(ccf, scale=c(8,.2), min.freq=2, max.words=Inf, random.order=FALSE, rot.per=.15, colors=brewer.pal(8, "Dark2"))
|
||||
wordcloud(ccf, scale=c(5,.3), max.words=250, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, "Dark3"))
|
Loading…
Reference in a new issue