-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtm_movies.R
110 lines (82 loc) · 2.56 KB
/
tm_movies.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
library(rvest)
library(XML)
library(magrittr)
library(tm)
library(wordcloud)
library(wordcloud2)
library(syuzhet)
library(lubridate)
library(ggplot2)
library(scales)
library(reshape2)
library(dplyr)
aurl <- "https://www.imdb.com/title/tt9420648/reviews?ref_=tt_ov_rt"
IMDB_reviews <- NULL
for (i in 1:10){
murl <- read_html(as.character(paste(aurl,i,sep="=")))
rev <- murl %>%
html_nodes(".show-more__control") %>%
html_text()
IMDB_reviews <- c(IMDB_reviews,rev)
}
length(IMDB_reviews)
getwd()
setwd("C:/Users/Aju/Documents/IMDB Analysis")
write.table(IMDB_reviews,"Movies.txt",row.names = F)
Movies <- read.delim('Movies.txt')
str(Aquaman)
library(tm)
corpus <- Movies[-1,]
head(corpus)
class(corpus)
corpus <- Corpus(VectorSource(corpus))
inspect(corpus[1:5])
corpus <- tm_map(corpus,tolower)
inspect(corpus[1:5])
corpus <- tm_map(corpus,removePunctuation)
inspect(corpus[1:5])
corpus <- tm_map(corpus,removeNumbers)
inspect(corpus[1:5])
corpus_clean<-tm_map(corpus,stripWhitespace)
inspect(corpus[1:5])
cleanset<-tm_map(corpus,removeWords, stopwords('english'))
inspect(cleanset[1:5])
removeURL <- function(x) gsub('http[[:alnum:]]*','',x)
cleanset <- tm_map(cleanset, content_transformer(removeURL))
inspect(cleanset[1:5])
cleanset<-tm_map(cleanset,removeWords, c('can','film'))
cleanset<-tm_map(cleanset,removeWords, c('movie','movies'))
cleanset <- tm_map(cleanset, gsub,pattern = 'character', replacement = 'characters')
inspect(cleanset[1:5])
cleanset <- tm_map(cleanset,stripWhitespace)
inspect(cleanset[1:5])
tdm <- TermDocumentMatrix(cleanset)
tdm
tdm <- as.matrix(tdm)
tdm[1:10,1:20]
w <- rowSums(tdm)
w <- subset(w, w>= 50)
barplot(w, las = 2, col = rainbow(50))
w <- sort(rowSums(tdm), decreasing = TRUE)
set.seed(123)
wordcloud(words = names(w), freq = w,
max.words = 250,random.order = F,
min.freq = 3,
colors = brewer.pal(8, 'Dark2'),
scale = c(5,0.3),
rot.per = 0.6)
w <- data.frame(names(w),w)
colnames(w) <- c('word','freq')
wordcloud2(w,size = 0.5, shape = 'triangle', rotateRatio = 0.5,
minSize = 1)
letterCloud(w,word = 'A',frequency(5), size=1)
IMDB_reviews <- read.delim('Movies.txt')
reviews <- as.character(IMDB_reviews[-1,])
class(reviews)
s <- get_nrc_sentiment(reviews)
reviews[4]
get_nrc_sentiment('splendid')
get_nrc_sentiment('no words')
barplot(colSums(s), las = 2.5, col = rainbow(10),
ylab = 'Count',main= 'Sentiment scores for IMDB Reviews
for bala')