Data Science Manager - Accenture
M.S. in Predictive Analytics - DePaul University
Me
Malter Analytics
GitHub
LinkedIn
YouTube Channel
Twitter
Kaggle
Other Work
General Assembly
AriBall
Media
Built In
This analysis is inspired by the FiveThirtyEight post, The Gone Girl With The Dragon Tattoo On The Train, in which the authors ask the question, “why are there so many books with ‘girl’ in the title?”. The goal of this analysis is to replicate the code used to collect reading data from Goodreads.com. To measure popularity, I assume that the more often a book is rated on Goodreads, whether positively or negatively, the more popular it is, because a rating means that the book was at least read.
# Load in the required packages
library(rvest)
library(stringr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggthemes)
library(scales)
library(knitr)
library(kableExtra)
# Scrape Goodreads title-search results for "girl" and "boy".
#
# For every result page, four raw text fields are collected per keyword:
# title, rating summary, publication blurb and editions link text. The pages
# are stored in preallocated lists and stacked once after the loop, instead of
# growing eight data frames with rbind() on every iteration.
#
# Objects produced (one-column data frames of raw text, cleaned further below):
#   girl.title, girl.rating, girl.published, girl.editions
#   boy.title,  boy.rating,  boy.published,  boy.editions
#
# Author scraping (selector '.tableList') stays disabled, matching the
# commented-out author cleaning code later in the script.

# number of search-result pages requested per keyword
n.pages <- 100

# Build the title-search URL for one keyword and one result page.
goodreads_search_url <- function(keyword, page) {
  paste0('https://www.goodreads.com/search?page=', page, '&q=', keyword,
         '&search%5Bfield%5D=title&search_type=books&tab=books&utf8=%E2%9C%93')
}

# Download one result page and return its raw text fields as a named list of
# character vectors. A failed download stops the script, as before.
scrape_search_page <- function(keyword, page) {
  webpage <- read_html(goodreads_search_url(keyword, page))
  list(
    # titles: drop embedded line breaks, then trim surrounding whitespace
    title = html_nodes(webpage, '.bookTitle') %>%
      html_text() %>%
      str_replace_all("[\r\n]", "") %>%
      str_trim(side = "both"),
    # rating summary text, trimmed only
    rating = html_nodes(webpage, '.minirating') %>%
      html_text() %>%
      str_trim(side = "both"),
    # publication blurb, left raw for the cleaning step
    published = html_nodes(webpage, '.uitext') %>%
      html_text(),
    # editions link text, left raw for the cleaning step
    editions = html_nodes(webpage, '.greyText a') %>%
      html_text()
  )
}

# Stack one field from every scraped page into a one-column data frame.
# The column name is only a placeholder: the cleaning code renames column 1.
stack_field <- function(pages, field) {
  values <- as.character(unlist(lapply(pages, `[[`, field)))
  setNames(data.frame(values, stringsAsFactors = FALSE), ".")
}

# Fetch the pages (girl first, then boy, for each page number)
girl.pages <- vector("list", n.pages)
boy.pages <- vector("list", n.pages)
for (i in seq_len(n.pages)) {
  girl.pages[[i]] <- scrape_search_page("girl", i)
  boy.pages[[i]] <- scrape_search_page("boy", i)
}

### Title ###
girl.title <- stack_field(girl.pages, "title")
boy.title <- stack_field(boy.pages, "title")

### Rating ###
girl.rating <- stack_field(girl.pages, "rating")
boy.rating <- stack_field(boy.pages, "rating")

### Published ###
girl.published <- stack_field(girl.pages, "published")
boy.published <- stack_field(boy.pages, "published")

### Editions ###
girl.editions <- stack_field(girl.pages, "editions")
boy.editions <- stack_field(boy.pages, "editions")
# clean title: give the single scraped column a descriptive name
colnames(girl.title)[1] <- "girl.title"
colnames(boy.title)[1] <- "boy.title"
# clean author
# names(girl.author)[1] <- "author"
# names(boy.author)[1] <- "author"
#
# girl.author <- str_replace_all(girl.author$author, "[\r\n]" , "")
# girl.author <- data.frame( do.call( cbind, strsplit( girl.author, 'by' ) ) )
# girl.author <- girl.author[-1,] # remove first row
# girl.author <- gather(girl.author)
# names(girl.author)[names(girl.author) == 'value'] <- 'author'
# girl.author <- sub('avg rating — .*', '', girl.author$author)
# girl.author <- gsub('[[:digit:]]+', '', girl.author)
# girl.author <- gsub("[.]","",girl.author)
# girl.author <- sub(".(Goodreads Author).*", " \\", girl.author)
# girl.author <- sub(").*", " \\", girl.author)
# girl.author <- str_trim(girl.author, side = "both")
# girl.author <- as.data.frame(girl.author)
# girl.author <- girl.author[!apply(girl.author == "", 1, all),]
# girl.author <- gsub(",.*$", "", girl.author)
#
# boy.author <- str_replace_all(boy.author$author, "[\r\n]" , "")
# boy.author <- data.frame( do.call( cbind, strsplit( boy.author, 'by' ) ) )
# boy.author <- boy.author[-1,] # remove first row
# boy.author <- gather(boy.author)
# names(boy.author)[names(boy.author) == 'value'] <- 'author'
# boy.author <- sub('avg rating — .*', '', boy.author$author)
# boy.author <- gsub('[[:digit:]]+', '', boy.author)
# boy.author <- gsub("[.]","",boy.author)
# boy.author <- sub(".(Goodreads Author).*", " \\", boy.author)
# boy.author <- sub(").*", " \\", boy.author)
# boy.author <- str_trim(boy.author, side = "both")
# boy.author <- as.data.frame(boy.author)
# boy.author <- boy.author[!apply(boy.author == "", 1, all),]
# boy.author <- gsub(",.*$", "", boy.author)
# clean rating
# Each rating string yields two character vectors: the text before
# ' avg rating' and the first token after 'rating — ' with commas removed.
colnames(girl.rating)[1] <- "rating"
colnames(boy.rating)[1] <- "rating"

# text preceding ' avg rating'
strip_avg_rating <- function(txt) {
  sub(' avg rating.*', '', txt)
}

# first space-delimited token following 'rating — ', without commas
strip_total_ratings <- function(txt) {
  after.dash <- sub('.*rating — ', '', txt)
  first.token <- sub(' .*', '', after.dash)
  gsub(',', '', first.token)
}

girl.avg.rating <- strip_avg_rating(girl.rating$rating)
girl.total.ratings <- strip_total_ratings(girl.rating$rating)
boy.avg.rating <- strip_avg_rating(boy.rating$rating)
boy.total.ratings <- strip_total_ratings(boy.rating$rating)

# clean published
colnames(girl.published)[1] <- "published"
colnames(boy.published)[1] <- "published"

# Reduce the raw '.uitext' strings to the text between 'ratings—' and the next
# '—'; entries still mentioning "edition" or "rating" afterwards become NA.
# NOTE(review): the first pattern starts with '*' and uses the [A-z] range,
# which also covers some punctuation - kept as written, confirm it is intended.
clean_published <- function(raw) {
  txt <- gsub('*\n[A-z ]*', '' , raw)
  # drop the rating-widget entries
  txt <- txt[!grepl("Clear rating", txt)]
  txt <- txt[!grepl("Rate this book", txt)]
  txt <- sub('.*ratings—', '', txt)
  txt <- sub('—.*', '', txt)
  # some books don't contain published years
  txt <- ifelse(grepl("edition", txt), NA, txt)
  ifelse(grepl("rating", txt), NA, txt)
}

girl.published <- clean_published(girl.published$published)
boy.published <- clean_published(boy.published$published)

# clean editions: keep only the text before the first space
colnames(girl.editions)[1] <- "editions"
colnames(boy.editions)[1] <- "editions"
girl.editions <- sub(' .*', '', girl.editions$editions)
boy.editions <- sub(' .*', '', boy.editions$editions)
### Combine into Dataframe ###
# Builds one data frame per keyword (`girl`, `boy`) from the cleaned vectors:
#   1. bind title, average rating, total ratings, published year, editions
#   2. convert the title to character and the other columns to numeric
#   3. replace missing values with 0
#   4. keep only rows whose title contains the keyword
#   5. apply manual corrections and exclusions for specific titles

girl <- as.data.frame(cbind(girl.title, girl.avg.rating, girl.total.ratings, girl.published, girl.editions))
girl$girl.title <- as.character(girl$girl.title)
# as.character() first so factor columns convert by label, not by level code
girl$girl.avg.rating <- as.numeric(as.character(girl$girl.avg.rating))
girl$girl.total.ratings <- as.numeric(as.character(girl$girl.total.ratings))
girl$girl.published <- as.numeric(as.character(girl$girl.published))
girl$girl.editions <- as.numeric(as.character(girl$girl.editions))
girl[is.na(girl)] <- 0
# The filter result must be assigned back; otherwise it is computed and
# discarded, leaving titles without "girl"/"Girl" in the data.
girl <- girl[grepl("girl", girl$girl.title) | grepl("Girl", girl$girl.title), ]
girl$girl.title[girl$girl.title == 'The Diary of a Young Girl'] <- 'The Diary of a Young Girl (The Diary of Anne Frank)'
girl <- subset(girl, girl.title != 'Romeo and Juliet: Shakespeare Girl Edition') # exclude

boy <- as.data.frame(cbind(boy.title, boy.avg.rating, boy.total.ratings, boy.published, boy.editions))
boy$boy.title <- as.character(boy$boy.title)
boy$boy.avg.rating <- as.numeric(as.character(boy$boy.avg.rating))
boy$boy.total.ratings <- as.numeric(as.character(boy$boy.total.ratings))
boy$boy.published <- as.numeric(as.character(boy$boy.published))
boy$boy.editions <- as.numeric(as.character(boy$boy.editions))
# Manual year for this title. Reads the data frame's own (already numeric)
# column rather than the free-standing boy.published vector.
boy$boy.published <- ifelse(boy$boy.title == "Dr. Seuss's Green Eggs and Ham: For Soprano, Boy Soprano, and Orchestra", 1998, boy$boy.published)
boy[is.na(boy)] <- 0
boy <- boy[grepl("boy", boy$boy.title) | grepl("Boy", boy$boy.title), ]
boy <- subset(boy, boy.title != 'Göçebe: Cep Boy') # Translates to 'The Host'
boy <- subset(boy, boy.title != "The Chronicles Of Narnia : The Magician's Nephew, The Lion The Witch and The Wardrobe, The Horse and His Boy, Prince Caspian, The Voyage of The Dawn Treader, The Silver Chair, The Last Battle (The Chronicles of Narnia, #1-7)") # Number 5 is included separately
boy <- subset(boy, boy.title != "\"A Boy called 'It'\"") # Actual title is "A Child Called It"
Below are the top 10 books with ‘Boy’ or ‘Boys’ in the title and the top 10 books with ‘Girl’ or ‘Girls’ in the title. As shown, the top books with ‘Girl’ in the title have nearly 6 times the number of ratings on Goodreads as the top books with ‘Boy’ in the title. The most recently published of these are ‘The Girl with the Dragon Tattoo’, ‘Gone Girl’, and ‘The Girl on the Train’.
### Analysis ###
# Top 10 "Boy" Books
# Unique (title, total ratings, year) rows, most-rated first, limited to
# rank <= 10, then given display-friendly column names for the table.
boy.top10 <- boy %>%
  distinct(boy.title, boy.total.ratings, boy.published) %>%
  arrange(desc(boy.total.ratings)) %>%
  filter(rank(desc(boy.total.ratings)) <= 10) %>%
  rename(
    `Title` = boy.title,
    `Total Ratings` = boy.total.ratings,
    `Date Published` = boy.published
  )
boy.top10 %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "responsive"))
Title | Total Ratings | Date Published |
---|---|---|
The Boy in the Striped Pajamas | 318,779 | 2006 |
Where the Red Fern Grows: The Story of Two Dogs and a Boy | 278,208 | 1961 |
Oliver Twist: The Parish Boy’s Progress | 249,995 | 1838 |
Heaven is for Real: A Little Boy’s Astounding Story of His Trip to Heaven and Back | 242,323 | 2010 |
Peter and Wendy: The Boy Who Wouldn’t Grow Up | 194,446 | 1911 |
Lola and the Boy Next Door (Anna and the French Kiss, #2) | 122,083 | 2011 |
A Long Way Gone: Memoirs of a Boy Soldier | 119,382 | 2007 |
About a Boy | 111,520 | 1998 |
Dr. Seuss’s Green Eggs and Ham: For Soprano, Boy Soprano, and Orchestra | 90,304 | 1998 |
The Boy Who Sneaks in My Bedroom Window | 54,926 | 2011 |
# Top 10 "Girl" Books
# Unique (title, total ratings, year) rows, most-rated first, limited to
# rank <= 10, then given display-friendly column names for the table.
girl.top10 <- girl %>%
  distinct(girl.title, girl.total.ratings, girl.published) %>%
  arrange(desc(girl.total.ratings)) %>%
  filter(rank(desc(girl.total.ratings)) <= 10) %>%
  rename(
    `Title` = girl.title,
    `Total Ratings` = girl.total.ratings,
    `Date Published` = girl.published
  )
girl.top10 %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "responsive"))
Title | Total Ratings | Date Published |
---|---|---|
The Diary of a Young Girl (The Diary of Anne Frank) | 1,990,749 | 1947 |
The Girl with the Dragon Tattoo (Millennium, #1) | 1,896,353 | 2005 |
Gone Girl | 1,598,463 | 2012 |
Young Girls | 1,300,976 | 1868 |
The Girl on the Train | 1,171,604 | 2015 |
The Girl Who Played with Fire (Millennium, #2) | 640,475 | 2006 |
The Girl Who Kicked the Hornet’s Nest (Millennium, #3) | 521,051 | 2007 |
Girl with a Pearl Earring | 485,129 | 1999 |
The Other Boleyn Girl (The Plantagenet and Tudor Novels, #9) | 396,422 | 2001 |
Kiss the Girls (Alex Cross, #2) | 277,299 | 1995 |
We see that for every year shown on this graph, titles with ‘Girl’ get more ratings than titles with ‘Boy’, and the side-by-side comparison gives an idea of how much more popular the titles with ‘Girl’ are in the years when the most popular of those books were published.
# summary statistics by year
# Per publication year: mean of the average ratings, summed total ratings,
# summed editions, and the number of books - once per keyword.
girl.table <- girl %>%
  group_by(girl.published) %>%
  summarise(
    girl.avg.rating = mean(girl.avg.rating),
    girl.total.ratings = sum(girl.total.ratings),
    girl.editions = sum(girl.editions),
    girl.count = n()
  )
#girl.table
boy.table <- boy %>%
  group_by(boy.published) %>%
  summarise(
    boy.avg.rating = mean(boy.avg.rating),
    boy.total.ratings = sum(boy.total.ratings),
    boy.editions = sum(boy.editions),
    boy.count = n()
  )
#boy.table
# Outer join on publication year; years present for only one keyword get 0
# in the other keyword's columns. The key column is then called 'published'.
final.df <- merge(boy.table, girl.table, by.x = "boy.published", by.y = "girl.published", all = TRUE)
final.df <- replace(final.df, is.na(final.df), 0)
final.df <- rename(final.df, published = boy.published)
# Total Number of Ratings by Year
# One line per keyword. The colour aesthetic is mapped to the hex strings so
# that scale_color_manual() can attach the legend labels and colours.
ggplot(final.df, aes(x = published)) +
  geom_line(aes(y = boy.total.ratings, colour = "#00BFC4"), size = 1.0) + # blue
  geom_line(aes(y = girl.total.ratings, colour = "#F8766D"), size = 1.0) + # red
  scale_x_continuous(limits = c(1984, 2016), breaks = seq(1984, 2016, 4)) +
  scale_y_continuous(limits = c(0, 2550000), breaks = seq(0, 2550000, 250000), labels = comma) +
  scale_color_manual(name = '', labels = c("Boy(s)", "Girl(s)"), values = c("#00BFC4", "#F8766D")) +
  labs(
    x = 'Year Published',
    y = 'Total Ratings',
    title = 'Total Ratings for Books Containing "Boy(s)" or "Girl(s)"',
    # caption lines are joined with newlines; the empty last element keeps
    # the trailing newline
    caption = paste(
      "Boy(s)",
      "2005: The Boy in the Striped Pajamas",
      "2014: Heaven is for Real: A Little Boy's Astounding Story of His Trip to Heaven and Back",
      "Girl(s)",
      "2005: The Girl with the Dragon Tattoo",
      "2009: Girl with a Pearl Earring",
      "2012: Gone Girl",
      "2015: The Girl on the Train",
      "",
      sep = "\n"
    )
  ) +
  theme(
    # axes and panel
    axis.line = element_line(size = 1, colour = "black"),
    panel.grid.major = element_line(colour = "#d3d3d3"),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    # text
    plot.title = element_text(size = 14, hjust = 0.5, family = "Tahoma", face = "bold"),
    text = element_text(family = "Tahoma"),
    axis.text.x = element_text(colour = "black", size = 10),
    axis.text.y = element_text(colour = "black", size = 10),
    legend.key = element_rect(fill = "white", colour = "white"),
    # left-align the caption
    plot.caption = element_text(hjust = 0)
  )
Goodreads only allows you to scrape the first 100 pages of search results, with each page containing 20 books. Over the years, there has been a drastic increase in the number of books published with the words ‘Boy’ and ‘Girl’ in the title. That said, titles with ‘Girl’ seem to do much better with regard to popularity. The drop in 2016 may be because 2016 books were not yet popular enough to have been read by many people and therefore did not show up in the Goodreads results when searching for ‘Boy’ and ‘Girl’.
# Count of Books
# Number of scraped books per publication year, one line per keyword.
ggplot(final.df, aes(x = published)) +
  geom_line(aes(y = boy.count, colour = "#00BFC4"), size = 1.0) + # blue
  geom_line(aes(y = girl.count, colour = "#F8766D"), size = 1.0) + # red
  scale_x_continuous(limits = c(1984, 2016), breaks = seq(1984, 2016, 4)) +
  scale_y_continuous(limits = c(0, 250), breaks = seq(0, 250, 25), labels = comma) +
  scale_color_manual(name = '', labels = c("Boy(s)", "Girl(s)"), values = c("#00BFC4", "#F8766D")) +
  labs(
    x = 'Year Published',
    y = 'Count of Books',
    title = 'Total Count of Books Containing "Boy(s)" or "Girl(s)"'
  ) +
  theme(
    # axes and panel
    axis.line = element_line(size = 1, colour = "black"),
    panel.grid.major = element_line(colour = "#d3d3d3"),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    # text
    plot.title = element_text(size = 14, hjust = 0.5, family = "Tahoma", face = "bold"),
    text = element_text(family = "Tahoma"),
    axis.text.x = element_text(colour = "black", size = 10),
    axis.text.y = element_text(colour = "black", size = 10),
    legend.key = element_rect(fill = "white", colour = "white")
  )
With regard to the average rating for these books on Goodreads, the two are almost identical, at 3.74 for boys and 3.75 for girls. I would expect that the overall average rating for all books on Goodreads is also right around 3.75.
# Average Rating
# Overall mean of the per-book average ratings, rounded to two decimals.
girl.mean.rating <- round(mean(girl$girl.avg.rating), 2)
boy.mean.rating <- round(mean(boy$boy.avg.rating), 2)
kable(paste0('Girl: ', girl.mean.rating))
kable(paste0('Boy: ', boy.mean.rating))
Average Rating |
---|
Girl: 3.75 |
Boy: 3.74 |
# Average Rating by Year
# Mean of the per-book average ratings per publication year, one line per
# keyword; the y axis is restricted to the 3-4 range.
ggplot(final.df, aes(x = published)) +
  geom_line(aes(y = boy.avg.rating, colour = "#00BFC4"), size = 1.0) + # blue
  geom_line(aes(y = girl.avg.rating, colour = "#F8766D"), size = 1.0) + # red
  scale_x_continuous(limits = c(1984, 2016), breaks = seq(1984, 2016, 4)) +
  scale_y_continuous(limits = c(3, 4), breaks = seq(3, 4, .1), labels = comma) +
  scale_color_manual(name = '', labels = c("Boy(s)", "Girl(s)"), values = c("#00BFC4", "#F8766D")) +
  labs(
    x = 'Year Published',
    y = 'Rating',
    title = 'Average Rating of Books Containing "Boy(s)" or "Girl(s)"'
  ) +
  theme(
    # axes and panel
    axis.line = element_line(size = 1, colour = "black"),
    panel.grid.major = element_line(colour = "#d3d3d3"),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    # text
    plot.title = element_text(size = 14, hjust = 0.5, family = "Tahoma", face = "bold"),
    text = element_text(family = "Tahoma"),
    axis.text.x = element_text(colour = "black", size = 10),
    axis.text.y = element_text(colour = "black", size = 10),
    legend.key = element_rect(fill = "white", colour = "white")
  )
In conclusion, there have been some extremely popular books lately with ‘Girl’ in the title, but this has not been the case with ‘Boy’. Based on the number of books published with these words, authors seem to be taking a chance with ‘Boy’, because nearly as many of those books are being published as ones with ‘Girl’, but they are not nearly as popular. In any case, ‘Girl’ has prevailed over ‘Boy’.