danmalter.github.io - Personal GitHub site for Danny Malter

Goodreads Analysis of Book Titles with Boy and Girl

This analysis is inspired by the FiveThirtyEight post, The Gone Girl With The Dragon Tattoo On The Train, in which the authors ask the question, “why are there so many books with ‘girl’ in the title?”. The goal of this analysis is to replicate the code used to collect reading data from Goodreads.com. To measure popularity, I am assuming that the more often a book is rated on Goodreads, whether positively or negatively, the more popular it is, because each rating means that the book was at least read.

# Load in the required packages
library(rvest)
library(stringr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggthemes)
library(scales)
library(knitr)
library(kableExtra)

# Scrape the Goodreads title search for "girl" and "boy".
#
# For every results page, four pieces of text are pulled per search term
# (title, rating summary, publication blurb, editions link). Each page's values
# are stored in a pre-allocated list and combined once after the loop, instead
# of growing eight data frames with rbind() on every iteration.
#
# Result: girl.title, girl.rating, girl.published, girl.editions and the
# matching boy.* objects, each a one-column data frame of character values.
# The cleaning code further down renames that column by position.

# number of search result pages requested per search term
n.pages <- 100

# CSS selector for each scraped field
field.selectors <- c(
  title     = ".bookTitle",
  rating    = ".minirating",
  published = ".uitext",
  editions  = ".greyText a"
  # author  = ".tableList"   # author scraping is disabled (see "clean author")
)

# Build the search URL for one search term and one results page.
search.url <- function(term, page) {
  paste0("https://www.goodreads.com/search?page=", page, "&q=", term,
         "&search%5Bfield%5D=title&search_type=books&tab=books&utf8=%E2%9C%93")
}

# Download one results page and return a named list holding one character
# vector per entry of field.selectors.
scrape.page <- function(term, page) {
  webpage <- read_html(search.url(term, page))
  fields <- lapply(field.selectors, function(css) {
    html_text(html_nodes(webpage, css))
  })
  # titles: drop embedded line breaks, then trim; ratings: trim only
  fields$title <- str_trim(str_replace_all(fields$title, "[\r\n]", ""), side = "both")
  fields$rating <- str_trim(fields$rating, side = "both")
  fields
}

# Combine one field across all scraped pages into a one-column data frame.
field.frame <- function(pages, field) {
  values <- unlist(lapply(pages, `[[`, field), use.names = FALSE)
  data.frame(value = as.character(values), stringsAsFactors = FALSE)
}

# Fetch the pages; "girl" and "boy" are requested alternately, page by page.
girl.pages <- vector("list", n.pages)
boy.pages <- vector("list", n.pages)

for (i in seq_len(n.pages)) {
  girl.pages[[i]] <- scrape.page("girl", i)
  boy.pages[[i]] <- scrape.page("boy", i)
}

### Title ###
girl.title <- field.frame(girl.pages, "title")
boy.title <- field.frame(boy.pages, "title")

### Author ###
# (disabled) girl.author <- field.frame(girl.pages, "author")
# (disabled) boy.author <- field.frame(boy.pages, "author")

### Rating ###
girl.rating <- field.frame(girl.pages, "rating")
boy.rating <- field.frame(boy.pages, "rating")

### Published ###
girl.published <- field.frame(girl.pages, "published")
boy.published <- field.frame(boy.pages, "published")

### Editions ###
girl.editions <- field.frame(girl.pages, "editions")
boy.editions <- field.frame(boy.pages, "editions")

# clean title
# label the single scraped column of each title table
colnames(girl.title)[1] <- "girl.title"
colnames(boy.title)[1] <- "boy.title"

# clean author
# names(girl.author)[1] <- "author"
# names(boy.author)[1] <- "author"
# 
# girl.author <- str_replace_all(girl.author$author, "[\r\n]" , "")
# girl.author <- data.frame( do.call( cbind, strsplit( girl.author, 'by' ) ) ) 
# girl.author <- girl.author[-1,]  # remove first row
# girl.author <- gather(girl.author)
# names(girl.author)[names(girl.author) == 'value'] <- 'author'
# girl.author <- sub('avg rating — .*', '', girl.author$author)
# girl.author <- gsub('[[:digit:]]+', '', girl.author)
# girl.author <- gsub("[.]","",girl.author) 
# girl.author <- sub(".(Goodreads Author).*", " \\", girl.author)
# girl.author <- sub(").*", " \\", girl.author)
# girl.author <- str_trim(girl.author, side = "both")
# girl.author <- as.data.frame(girl.author)
# girl.author <- girl.author[!apply(girl.author == "", 1, all),]
# girl.author <- gsub(",.*$", "", girl.author)
# 
# boy.author <- str_replace_all(boy.author$author, "[\r\n]" , "")
# boy.author <- data.frame( do.call( cbind, strsplit( boy.author, 'by' ) ) ) 
# boy.author <- boy.author[-1,]  # remove first row
# boy.author <- gather(boy.author)
# names(boy.author)[names(boy.author) == 'value'] <- 'author'
# boy.author <- sub('avg rating — .*', '', boy.author$author)
# boy.author <- gsub('[[:digit:]]+', '', boy.author)
# boy.author <- gsub("[.]","",boy.author) 
# boy.author <- sub(".(Goodreads Author).*", " \\", boy.author)
# boy.author <- sub(").*", " \\", boy.author)
# boy.author <- str_trim(boy.author, side = "both")
# boy.author <- as.data.frame(boy.author)
# boy.author <- boy.author[!apply(boy.author == "", 1, all),]
# boy.author <- gsub(",.*$", "", boy.author)

# clean rating
# Splits each scraped rating string into its average score and its rating
# count (presumably the text reads "<avg> avg rating — <n> ratings").
names(girl.rating)[1] <- "rating"
names(boy.rating)[1] <- "rating"

# everything before " avg rating" is the average score
average.part <- function(txt) {
  sub(' avg rating.*', '', txt)
}

# text after "rating — ", cut at the first space, with commas removed
count.part <- function(txt) {
  after.dash <- sub('.*rating — ', '', txt)
  first.word <- sub(' .*', '', after.dash)
  gsub(',', '', first.word)
}

girl.avg.rating <- average.part(girl.rating$rating)
girl.total.ratings <- count.part(girl.rating$rating)

boy.avg.rating <- average.part(boy.rating$rating)
boy.total.ratings <- count.part(boy.rating$rating)

# clean published
# Reduce the scraped ".uitext" strings to the publication-year text.
names(girl.published)[1] <- "published"
names(boy.published)[1] <- "published"

clean.published <- function(raw) {
  txt <- gsub('*\n[A-z ]*', '' , raw)
  # drop the entries that belong to the rating widgets rather than to a book
  txt <- txt[!grepl("Clear rating", txt)]
  txt <- txt[!grepl("Rate this book", txt)]
  # keep what sits between "ratings—" and the next "—"
  txt <- sub('.*ratings—', '', txt)
  txt <- sub('—.*', '', txt)
  # some books don't contain published years
  txt <- ifelse(grepl("edition", txt), NA, txt)
  ifelse(grepl("rating", txt), NA, txt)
}

girl.published <- clean.published(girl.published$published)
boy.published <- clean.published(boy.published$published)

# clean editions
colnames(girl.editions)[1] <- "editions"
colnames(boy.editions)[1] <- "editions"

# keep only the text before the first space of each editions link
girl.editions <- sub(' .*', '', girl.editions[["editions"]])
boy.editions <- sub(' .*', '', boy.editions[["editions"]])


### Combine into Dataframe ###

# Assemble the "girl" table: one row per scraped book with its title, average
# rating, number of ratings, publication year and number of editions.
girl <- as.data.frame(cbind(girl.title, girl.avg.rating, girl.total.ratings, girl.published, girl.editions))
girl$girl.title <- as.character(girl$girl.title)

# the scraped values are text (or factors); convert them to numbers
for (col in c("girl.avg.rating", "girl.total.ratings", "girl.published", "girl.editions")) {
  girl[[col]] <- as.numeric(as.character(girl[[col]]))
}

# values that could not be parsed are stored as 0
girl[is.na(girl)] <- 0

# Keep only titles that actually contain "girl"/"Girl". This subset used to be
# computed and then discarded because the result was never assigned.
girl <- girl[grepl("girl", girl$girl.title) | grepl("Girl", girl$girl.title), ]

girl$girl.title[girl$girl.title == 'The Diary of a Young Girl'] <- 'The Diary of a Young Girl (The Diary of Anne Frank)'
girl <- subset(girl, girl.title != 'Romeo and Juliet: Shakespeare Girl Edition')  # exclude

# Assemble the "boy" table: one row per scraped book with its title, average
# rating, number of ratings, publication year and number of editions.
boy <- as.data.frame(cbind(boy.title, boy.avg.rating, boy.total.ratings, boy.published, boy.editions))
boy$boy.title <- as.character(boy$boy.title)

# the scraped values are text (or factors); convert them to numbers
for (col in c("boy.avg.rating", "boy.total.ratings", "boy.published", "boy.editions")) {
  boy[[col]] <- as.numeric(as.character(boy[[col]]))
}

# Manual publication year for one title. The override is applied to the data
# frame column rather than to the free-standing boy.published vector.
boy$boy.published[boy$boy.title %in% "Dr. Seuss's Green Eggs and Ham: For Soprano, Boy Soprano, and Orchestra"] <- 1998

# values that could not be parsed are stored as 0
boy[is.na(boy)] <- 0

# keep only titles that actually contain "boy"/"Boy"
boy <- boy[grepl("boy", boy$boy.title) | grepl("Boy", boy$boy.title), ]

boy <- subset(boy, boy.title != 'Göçebe: Cep Boy')  # Translates to 'The Host'
boy <- subset(boy, boy.title != "The Chronicles Of Narnia : The Magician's Nephew, The Lion The Witch and The Wardrobe, The Horse and His Boy, Prince Caspian, The Voyage of The Dawn Treader, The Silver Chair, The Last Battle (The Chronicles of Narnia, #1-7)")  # Number 5 is included separately
boy <- subset(boy, boy.title != "\"A Boy called 'It'\"")  # Actual title is "A Child Called It"

Below are the top 10 books with ‘Boy’ or ‘Boys’ in the title and the top 10 books with ‘Girl’ or ‘Girls’ in the title. As shown, the top books with ‘Girl’ in the title have nearly 6 times as many ratings on Goodreads as the top books with ‘Boy’ in the title. The most recently published of these are ‘The Girl with the Dragon Tattoo’, ‘Gone Girl’, and ‘The Girl on the Train’.

Top 10 Books with ‘Boy’ or ‘Boys’ in the Title


### Analysis ###

# Top 10 "Boy" Books
# Unique title/ratings/year rows, most-rated first, limited to rank 10 or
# better, with presentation-friendly column names.
boy.top10 <- boy %>%
  distinct(boy.title, boy.total.ratings, boy.published) %>%
  arrange(desc(boy.total.ratings)) %>%
  filter(rank(desc(boy.total.ratings)) <= 10) %>%
  rename(
    Title = boy.title,
    `Total Ratings` = boy.total.ratings,
    `Date Published` = boy.published
  )

boy.top10 %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "responsive"))

Title	Total Ratings	Date Published
The Boy in the Striped Pajamas	318,779	2006
Where the Red Fern Grows: The Story of Two Dogs and a Boy	278,208	1961
Oliver Twist: The Parish Boy’s Progress	249,995	1838
Heaven is for Real: A Little Boy’s Astounding Story of His Trip to Heaven and Back	242,323	2010
Peter and Wendy: The Boy Who Wouldn’t Grow Up	194,446	1911
Lola and the Boy Next Door (Anna and the French Kiss, #2)	122,083	2011
A Long Way Gone: Memoirs of a Boy Soldier	119,382	2007
About a Boy	111,520	1998
Dr. Seuss’s Green Eggs and Ham: For Soprano, Boy Soprano, and Orchestra	90,304	1998
The Boy Who Sneaks in My Bedroom Window	54,926	2011

Top 10 Books with ‘Girl’ or ‘Girls’ in the Title


# Top 10 "Girl" Books
# Unique title/ratings/year rows, most-rated first, limited to rank 10 or
# better, with presentation-friendly column names.
girl.top10 <- girl %>%
  distinct(girl.title, girl.total.ratings, girl.published) %>%
  arrange(desc(girl.total.ratings)) %>%
  filter(rank(desc(girl.total.ratings)) <= 10) %>%
  rename(
    Title = girl.title,
    `Total Ratings` = girl.total.ratings,
    `Date Published` = girl.published
  )

girl.top10 %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "responsive"))

Title	Total Ratings	Date Published
The Diary of a Young Girl (The Diary of Anne Frank)	1,990,749	1947
The Girl with the Dragon Tattoo (Millennium, #1)	1,896,353	2005
Gone Girl	1,598,463	2012
Young Girls	1,300,976	1868
The Girl on the Train	1,171,604	2015
The Girl Who Played with Fire (Millennium, #2)	640,475	2006
The Girl Who Kicked the Hornet’s Nest (Millennium, #3)	521,051	2007
Girl with a Pearl Earring	485,129	1999
The Other Boleyn Girl (The Plantagenet and Tudor Novels, #9)	396,422	2001
Kiss the Girls (Alex Cross, #2)	277,299	1995

Total Ratings on goodreads.com

We see that for every year shown on this graph, titles with ‘Girl’ get more reviews than book titles with ‘Boy’ and the side by side comparison gives an idea of how much more popular the titles with ‘Girl’ are in the years when the most popular of those books were published.


# summary statistics by year
# One row per publication year: mean of the average ratings, summed rating
# counts, summed edition counts and the number of books.
girl.table <- girl %>%
  group_by(girl.published) %>%
  summarise(
    girl.avg.rating = mean(girl.avg.rating),
    girl.total.ratings = sum(girl.total.ratings),
    girl.editions = sum(girl.editions),
    girl.count = n()
  )
#girl.table

boy.table <- boy %>%
  group_by(boy.published) %>%
  summarise(
    boy.avg.rating = mean(boy.avg.rating),
    boy.total.ratings = sum(boy.total.ratings),
    boy.editions = sum(boy.editions),
    boy.count = n()
  )
#boy.table

# full outer join on year; years missing on one side are filled with 0
final.df <- merge(
  x = boy.table,
  y = girl.table,
  by.x = "boy.published",
  by.y = "girl.published",
  all = TRUE
)
final.df[is.na(final.df)] <- 0
names(final.df)[names(final.df) == 'boy.published'] <- 'published'

# Total Number of Ratings by Year
# One line per series. The colour aesthetic is mapped to the series label and
# the label -> colour pairs are given by name, so the legend no longer depends
# on the alphabetical order of hex strings used as aesthetic values.
ggplot(final.df, aes(x = published)) +
  geom_line(aes(y = boy.total.ratings, colour = "Boy(s)"), size = 1.0) +   # blue
  geom_line(aes(y = girl.total.ratings, colour = "Girl(s)"), size = 1.0) + # red
  scale_x_continuous(limits = c(1984, 2016), breaks = seq(1984, 2016, 4)) +
  scale_y_continuous(limits = c(0, 2550000), breaks = seq(0, 2550000, 250000), labels = comma) +
  scale_color_manual(name = '', values = c("Boy(s)" = "#00BFC4", "Girl(s)" = "#F8766D")) +
  theme(axis.line = element_line(size = 1, colour = "black"),
        panel.grid.major = element_line(colour = "#d3d3d3"),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        plot.title = element_text(size = 14, hjust = 0.5, family = "Tahoma", face = "bold"),
        text = element_text(family = "Tahoma"),
        axis.text.x = element_text(colour = "black", size = 10),
        axis.text.y = element_text(colour = "black", size = 10),
        legend.key = element_rect(fill = "white", colour = "white"),
        plot.caption = element_text(hjust = 0)) +
  labs(x = 'Year Published',
       y = 'Total Ratings',
       title = 'Total Ratings for Books Containing "Boy(s)" or "Girl(s)"',
       caption = "Boy(s)
2005: The Boy in the Striped Pajamas
2014: Heaven is for Real: A Little Boy's Astounding Story of His Trip to Heaven and Back

Girl(s)
2005: The Girl with the Dragon Tattoo
2009: Girl with a Pearl Earring
2012: Gone Girl
2015: The Girl on the Train
       ")

Count of Books on goodreads.com

Goodreads only allows you to scrape the first 100 pages of search results, with each page containing 20 books. Over the years, there has been a drastic increase in the number of books published with both the words ‘Boy’ and ‘Girl’ in the title. That said, titles with ‘Girl’ seem to do much better with regard to popularity. The drop in 2016 may be because 2016 books are not yet popular enough to have been read by lots of people and therefore did not show up in the Goodreads results when searching for ‘Boy’ and ‘Girl’.

# Count of Books
# Number of books per publication year, one line per series. The colour
# aesthetic is mapped to the series label and the label -> colour pairs are
# given by name, so the legend no longer depends on the alphabetical order of
# hex strings used as aesthetic values.
ggplot(final.df, aes(x = published)) +
  geom_line(aes(y = boy.count, colour = "Boy(s)"), size = 1.0) +   # blue
  geom_line(aes(y = girl.count, colour = "Girl(s)"), size = 1.0) + # red
  scale_x_continuous(limits = c(1984, 2016), breaks = seq(1984, 2016, 4)) +
  scale_y_continuous(limits = c(0, 250), breaks = seq(0, 250, 25), labels = comma) +
  scale_color_manual(name = '', values = c("Boy(s)" = "#00BFC4", "Girl(s)" = "#F8766D")) +
  labs(x = 'Year Published',
       y = 'Count of Books',
       title = 'Total Count of Books Containing "Boy(s)" or "Girl(s)"') +
  theme(axis.line = element_line(size = 1, colour = "black"),
        panel.grid.major = element_line(colour = "#d3d3d3"),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        plot.title = element_text(size = 14, hjust = 0.5, family = "Tahoma", face = "bold"),
        text = element_text(family = "Tahoma"),
        axis.text.x = element_text(colour = "black", size = 10),
        axis.text.y = element_text(colour = "black", size = 10),
        legend.key = element_rect(fill = "white", colour = "white"))

Average Rating by Year

The average rating for these books on Goodreads is almost identical, at 3.74 for boys and 3.75 for girls. I would expect that the overall average rating for all books on Goodreads is also right around 3.75.


# Average Rating
# Overall mean of the per-book average ratings, rounded to two decimals and
# rendered with kable(). paste0() has no `sep` argument (a `sep = ''` passed to
# it is just concatenated as one more empty string), so it is not supplied.
kable(paste0('Girl: ', round(mean(girl$girl.avg.rating), 2)))
kable(paste0('Boy: ', round(mean(boy$boy.avg.rating), 2)))

Average Rating
Girl: 3.75
Boy: 3.74


# Average Rating by Year
# One line per series. The colour aesthetic is mapped to the series label and
# the label -> colour pairs are given by name, so the legend no longer depends
# on the alphabetical order of hex strings used as aesthetic values.
ggplot(final.df, aes(x = published)) +
  geom_line(aes(y = boy.avg.rating, colour = "Boy(s)"), size = 1.0) +   # blue
  geom_line(aes(y = girl.avg.rating, colour = "Girl(s)"), size = 1.0) + # red
  scale_x_continuous(limits = c(1984, 2016), breaks = seq(1984, 2016, 4)) +
  scale_y_continuous(limits = c(3, 4), breaks = seq(3, 4, .1), labels = comma) +
  scale_color_manual(name = '', values = c("Boy(s)" = "#00BFC4", "Girl(s)" = "#F8766D")) +
  labs(x = 'Year Published',
       y = 'Rating',
       title = 'Average Rating of Books Containing "Boy(s)" or "Girl(s)"') +
  theme(axis.line = element_line(size = 1, colour = "black"),
        panel.grid.major = element_line(colour = "#d3d3d3"),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        plot.title = element_text(size = 14, hjust = 0.5, family = "Tahoma", face = "bold"),
        text = element_text(family = "Tahoma"),
        axis.text.x = element_text(colour = "black", size = 10),
        axis.text.y = element_text(colour = "black", size = 10),
        legend.key = element_rect(fill = "white", colour = "white"))

In conclusion, there have been some extremely popular books lately with ‘Girl’ in the title, but this has not been the case with ‘Boy’. Based on the number of books published with these words, authors seem to be taking a chance with ‘Boy’, because nearly as many of those books are being published as ones with ‘Girl’, but they are not nearly as popular. In any case, ‘Girl’ has prevailed over ‘Boy’.

Goodreads Analysis of Book Titles with ‘Boy’ and ‘Girl’

Top 10 Books with ‘Boy’ or ‘Boys’ in the Title

Top 10 Books with ‘Girl’ or ‘Girls’ in the Title

Total Ratings on goodreads.com

Count of Books on goodreads.com

Average Rating by Year