Danny Malter

Data Science Manager - Accenture
M.S. in Predictive Analytics - DePaul University

Danny Malter

Me
Malter Analytics
GitHub
LinkedIn
YouTube Channel
Twitter
Kaggle

Other Work
General Assembly
AriBall

Media
Built In

Goodreads Analysis of Book Titles with Boy and Girl


This analysis is inspired by the FiveThirtyEight post, The Gone Girl With The Dragon Tattoo On The Train, in which the authors ask the question, “why are there so many books with ‘girl’ in the title?”. The goal of this analysis is to replicate the code used to collect reading data from Goodreads.com. To measure popularity, I assume that the more often a book is rated on Goodreads, whether positively or negatively, the more popular it is, because each rating means that the book was at least read.

# Load in the required packages
library(rvest)
library(stringr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggthemes)
library(scales)
library(knitr)
library(kableExtra)

# Scrape Goodreads title-search results for "girl" and "boy".
#
# For each keyword the first 100 result pages are fetched and four pieces of
# text are pulled from every page (CSS selector in parentheses):
#   title     (.bookTitle)   - newlines removed, whitespace trimmed
#   rating    (.minirating)  - whitespace trimmed
#   published (.uitext)      - raw node text
#   editions  (.greyText a)  - raw node text
# The results are stored in one-column data frames named <keyword>.<field>
# (for example girl.title), which the cleaning code below renames and parses.
#
# Author scraping (selector '.tableList') is disabled; the matching author
# cleaning code further down is commented out as well.

# Number of search-result pages requested per keyword
n.pages <- 100

# Build the search URL for one keyword and one result page.
search.url <- function(keyword, page) {
  paste0('https://www.goodreads.com/search?page=', page, '&q=', keyword,
         '&search%5Bfield%5D=title&search_type=books&tab=books&utf8=%E2%9C%93')
}

# Text of every node on `page` that matches the CSS selector `css`.
node.text <- function(page, css) {
  html_text(html_nodes(page, css))
}

# Download every result page for `keyword` and return a named list of
# one-column data frames: title, rating, published, editions.
scrape.keyword <- function(keyword, pages = seq_len(n.pages)) {
  per.page <- lapply(pages, function(i) {
    webpage <- read_html(search.url(keyword, i))
    list(
      title = node.text(webpage, '.bookTitle') %>%
        str_replace_all("[\r\n]", "") %>%
        str_trim(side = "both"),
      rating = str_trim(node.text(webpage, '.minirating'), side = "both"),
      published = node.text(webpage, '.uitext'),
      editions = node.text(webpage, '.greyText a')
    )
  })

  # Bind each field once, after all pages are read, instead of growing a
  # data frame with rbind() on every iteration.
  fields <- c("title", "rating", "published", "editions")
  stacked <- lapply(fields, function(field) {
    values <- as.character(unlist(lapply(per.page, `[[`, field)))
    data.frame(value = values, stringsAsFactors = FALSE)
  })
  setNames(stacked, fields)
}

girl.scraped <- scrape.keyword("girl")
boy.scraped <- scrape.keyword("boy")

girl.title <- girl.scraped[["title"]]
boy.title <- boy.scraped[["title"]]

girl.rating <- girl.scraped[["rating"]]
boy.rating <- boy.scraped[["rating"]]

girl.published <- girl.scraped[["published"]]
boy.published <- boy.scraped[["published"]]

girl.editions <- girl.scraped[["editions"]]
boy.editions <- boy.scraped[["editions"]]

# Label the single scraped column of each title table
colnames(girl.title)[1] <- "girl.title"
colnames(boy.title)[1] <- "boy.title"

# clean author
# names(girl.author)[1] <- "author"
# names(boy.author)[1] <- "author"
# 
# girl.author <- str_replace_all(girl.author$author, "[\r\n]" , "")
# girl.author <- data.frame( do.call( cbind, strsplit( girl.author, 'by' ) ) ) 
# girl.author <- girl.author[-1,]  # remove first row
# girl.author <- gather(girl.author)
# names(girl.author)[names(girl.author) == 'value'] <- 'author'
# girl.author <- sub('avg rating — .*', '', girl.author$author)
# girl.author <- gsub('[[:digit:]]+', '', girl.author)
# girl.author <- gsub("[.]","",girl.author) 
# girl.author <- sub(".(Goodreads Author).*", " \\", girl.author)
# girl.author <- sub(").*", " \\", girl.author)
# girl.author <- str_trim(girl.author, side = "both")
# girl.author <- as.data.frame(girl.author)
# girl.author <- girl.author[!apply(girl.author == "", 1, all),]
# girl.author <- gsub(",.*$", "", girl.author)
# 
# boy.author <- str_replace_all(boy.author$author, "[\r\n]" , "")
# boy.author <- data.frame( do.call( cbind, strsplit( boy.author, 'by' ) ) ) 
# boy.author <- boy.author[-1,]  # remove first row
# boy.author <- gather(boy.author)
# names(boy.author)[names(boy.author) == 'value'] <- 'author'
# boy.author <- sub('avg rating — .*', '', boy.author$author)
# boy.author <- gsub('[[:digit:]]+', '', boy.author)
# boy.author <- gsub("[.]","",boy.author) 
# boy.author <- sub(".(Goodreads Author).*", " \\", boy.author)
# boy.author <- sub(").*", " \\", boy.author)
# boy.author <- str_trim(boy.author, side = "both")
# boy.author <- as.data.frame(boy.author)
# boy.author <- boy.author[!apply(boy.author == "", 1, all),]
# boy.author <- gsub(",.*$", "", boy.author)

# Drop everything from the first space onward.
first.token <- function(x) {
  sub(' .*', '', x)
}

# ---- Ratings ----
# Each rating string is split into the text before " avg rating" (the average
# score) and the first token after "rating — " with commas removed (the number
# of ratings). Both stay character vectors here.
names(girl.rating)[1] <- "rating"
names(boy.rating)[1] <- "rating"

avg.rating.text <- function(rating) {
  sub(' avg rating.*', '', rating)
}

total.ratings.text <- function(rating) {
  after.dash <- sub('.*rating — ', '', rating)
  gsub(',', '', first.token(after.dash))
}

girl.avg.rating <- avg.rating.text(girl.rating$rating)
girl.total.ratings <- total.ratings.text(girl.rating$rating)

boy.avg.rating <- avg.rating.text(boy.rating$rating)
boy.total.ratings <- total.ratings.text(boy.rating$rating)

# ---- Published ----
# Reduce each '.uitext' string to the text between "ratings—" and the next
# "—". Entries containing "Clear rating" or "Rate this book" are dropped, and
# anything still mentioning "edition" or "rating" becomes NA because some
# books don't contain published years.
names(girl.published)[1] <- "published"
names(boy.published)[1] <- "published"

clean.published <- function(raw) {
  txt <- gsub('*\n[A-z ]*', '', raw)
  txt <- txt[!grepl("Clear rating", txt)]
  txt <- txt[!grepl("Rate this book", txt)]
  txt <- sub('.*ratings—', '', txt)
  txt <- sub('—.*', '', txt)
  txt <- ifelse(grepl("edition", txt), NA, txt)
  ifelse(grepl("rating", txt), NA, txt)
}

girl.published <- clean.published(girl.published$published)
boy.published <- clean.published(boy.published$published)

# ---- Editions ----
# Keep only the first token of each editions link text.
names(girl.editions)[1] <- "editions"
names(boy.editions)[1] <- "editions"

girl.editions <- first.token(girl.editions$editions)
boy.editions <- first.token(boy.editions$editions)


### Combine into Dataframe ###

# Assemble one row per scraped book for each keyword, convert the text columns
# to numbers, replace missing values with 0, keep only titles that actually
# contain the keyword, and apply a few manual corrections and exclusions.

# Convert scraped text (or a factor of text) to numeric; values that cannot be
# parsed become NA with a warning.
to.number <- function(x) {
  as.numeric(as.character(x))
}

girl <- data.frame(girl.title, girl.avg.rating, girl.total.ratings,
                   girl.published, girl.editions, stringsAsFactors = FALSE)
girl$girl.title <- as.character(girl$girl.title)
girl.numeric <- c("girl.avg.rating", "girl.total.ratings",
                  "girl.published", "girl.editions")
girl[girl.numeric] <- lapply(girl[girl.numeric], to.number)
girl[is.na(girl)] <- 0
# Keep only titles containing "girl" or "Girl" (same rule as the boy filter)
girl <- girl[grepl("[Gg]irl", girl$girl.title), ]
girl$girl.title[girl$girl.title == 'The Diary of a Young Girl'] <- 'The Diary of a Young Girl (The Diary of Anne Frank)'
girl <- subset(girl, girl.title != 'Romeo and Juliet: Shakespeare Girl Edition')  # exclude

boy <- data.frame(boy.title, boy.avg.rating, boy.total.ratings,
                  boy.published, boy.editions, stringsAsFactors = FALSE)
boy$boy.title <- as.character(boy$boy.title)
boy.numeric <- c("boy.avg.rating", "boy.total.ratings",
                 "boy.published", "boy.editions")
boy[boy.numeric] <- lapply(boy[boy.numeric], to.number)
# Publication year for this title is set by hand
# (presumably the scraped value is missing or wrong -- TODO confirm)
boy$boy.published[boy$boy.title %in% "Dr. Seuss's Green Eggs and Ham: For Soprano, Boy Soprano, and Orchestra"] <- 1998
boy[is.na(boy)] <- 0
# Keep only titles containing "boy" or "Boy"
boy <- boy[grepl("[Bb]oy", boy$boy.title), ]
boy <- subset(boy, boy.title != 'Göçebe: Cep Boy')  # Translates to 'The Host'
boy <- subset(boy, boy.title != "The Chronicles Of Narnia : The Magician's Nephew, The Lion The Witch and The Wardrobe, The Horse and His Boy, Prince Caspian, The Voyage of The Dawn Treader, The Silver Chair, The Last Battle (The Chronicles of Narnia, #1-7)")  # Number 5 is included separately
boy <- subset(boy, boy.title != "\"A Boy called 'It'\"")  # Actual title is "A Child Called It"

Below are the top 10 books with ‘Boy’ or ‘Boys’ in the title and the top 10 books with ‘Girl’ or ‘Girls’ in the title. As shown, the top books with ‘Girl’ in the title have nearly 6 times as many ratings on Goodreads as the top books with ‘Boy’ in the title. The most recently published of these are ‘The Girl with the Dragon Tattoo’, ‘Gone Girl’, and ‘The Girl on the Train’.


Top 10 Books with ‘Boy’ or ‘Boys’ in the Title


### Analysis ###

# Ten most-rated "Boy" books: one row per distinct title / rating count /
# year, listed from most to fewest ratings, with display-friendly headers.
boy.top10 <- boy %>%
  distinct(boy.title, boy.total.ratings, boy.published) %>%
  filter(rank(desc(boy.total.ratings)) <= 10) %>%
  arrange(desc(boy.total.ratings)) %>%
  rename(
    `Title` = boy.title,
    `Total Ratings` = boy.total.ratings,
    `Date Published` = boy.published
  )

# Render as a styled HTML table
kable_styling(
  kable(boy.top10),
  bootstrap_options = c("striped", "hover", "responsive")
)
                                                                                               
Title Total Ratings Date Published
The Boy in the Striped Pajamas 318,779 2006
Where the Red Fern Grows: The Story of Two Dogs and a Boy 278,208 1961
Oliver Twist: The Parish Boy’s Progress 249,995 1838
Heaven is for Real: A Little Boy’s Astounding Story of His Trip to Heaven and Back 242,323 2010
Peter and Wendy: The Boy Who Wouldn’t Grow Up 194,446 1911
Lola and the Boy Next Door (Anna and the French Kiss, #2) 122,083 2011
A Long Way Gone: Memoirs of a Boy Soldier 119,382 2007
About a Boy 111,520 1998
Dr. Seuss’s Green Eggs and Ham: For Soprano, Boy Soprano, and Orchestra 90,304 1998
The Boy Who Sneaks in My Bedroom Window 54,926 2011


Top 10 Books with ‘Girl’ or ‘Girls’ in the Title


# Ten most-rated "Girl" books: one row per distinct title / rating count /
# year, listed from most to fewest ratings, with display-friendly headers.
girl.top10 <- girl %>%
  distinct(girl.title, girl.total.ratings, girl.published) %>%
  filter(rank(desc(girl.total.ratings)) <= 10) %>%
  arrange(desc(girl.total.ratings)) %>%
  rename(
    `Title` = girl.title,
    `Total Ratings` = girl.total.ratings,
    `Date Published` = girl.published
  )

# Render as a styled HTML table
kable_styling(
  kable(girl.top10),
  bootstrap_options = c("striped", "hover", "responsive")
)
                                                                                               
Title Total Ratings Date Published
The Diary of a Young Girl (The Diary of Anne Frank) 1,990,749 1947
The Girl with the Dragon Tattoo (Millennium, #1) 1,896,353 2005
Gone Girl 1,598,463 2012
Young Girls 1,300,976 1868
The Girl on the Train 1,171,604 2015
The Girl Who Played with Fire (Millennium, #2) 640,475 2006
The Girl Who Kicked the Hornet’s Nest (Millennium, #3) 521,051 2007
Girl with a Pearl Earring 485,129 1999
The Other Boleyn Girl (The Plantagenet and Tudor Novels, #9) 396,422 2001
Kiss the Girls (Alex Cross, #2) 277,299 1995


Total Ratings on goodreads.com

We see that for every year shown on this graph, titles with ‘Girl’ receive more ratings than titles with ‘Boy’, and the side-by-side comparison gives an idea of how much more popular the ‘Girl’ titles are in the years when the most popular of those books were published.


# Yearly summary per keyword: mean of the average ratings, summed rating
# counts, summed editions and number of books, grouped by publication year.
girl.by.year <- group_by(girl, girl.published)
girl.table <- summarise(
  girl.by.year,
  girl.avg.rating = mean(girl.avg.rating),
  girl.total.ratings = sum(girl.total.ratings),
  girl.editions = sum(girl.editions),
  girl.count = n()
)

boy.by.year <- group_by(boy, boy.published)
boy.table <- summarise(
  boy.by.year,
  boy.avg.rating = mean(boy.avg.rating),
  boy.total.ratings = sum(boy.total.ratings),
  boy.editions = sum(boy.editions),
  boy.count = n()
)

# Outer-join the two summaries on year; a year missing on one side gets 0s
final.df <- merge(
  boy.table, girl.table,
  by.x = "boy.published", by.y = "girl.published",
  all = TRUE
)
final.df[is.na(final.df)] <- 0
names(final.df)[names(final.df) == "boy.published"] <- "published"

# Total Number of Ratings by Year
# The colour aesthetic is mapped to two constant strings so that
# scale_color_manual() can attach the legend labels and the line colours.
ggplot(final.df, aes(x = published)) +
  geom_line(aes(y = boy.total.ratings, colour = "#00BFC4"), size = 1.0) +  # blue
  geom_line(aes(y = girl.total.ratings, colour = "#F8766D"), size = 1.0) +  # red
  scale_color_manual(
    name = "",
    labels = c("Boy(s)", "Girl(s)"),
    values = c("#00BFC4", "#F8766D")
  ) +
  scale_x_continuous(limits = c(1984, 2016), breaks = seq(1984, 2016, 4)) +
  scale_y_continuous(
    limits = c(0, 2550000),
    breaks = seq(0, 2550000, 250000),
    labels = comma
  ) +
  theme(
    axis.line = element_line(size = 1, colour = "black"),
    panel.grid.major = element_line(colour = "#d3d3d3"),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    plot.title = element_text(size = 14, hjust = 0.5, family = "Tahoma", face = "bold"),
    text = element_text(family = "Tahoma"),
    axis.text.x = element_text(colour = "black", size = 10),
    axis.text.y = element_text(colour = "black", size = 10),
    legend.key = element_rect(fill = "white", colour = "white"),
    plot.caption = element_text(hjust = 0)
  ) +
  labs(
    title = 'Total Ratings for Books Containing "Boy(s)" or "Girl(s)"',
    x = "Year Published",
    y = "Total Ratings",
    caption = "Boy(s)
2005: The Boy in the Striped Pajamas
2014: Heaven is for Real: A Little Boy's Astounding Story of His Trip to Heaven and Back

Girl(s)
2005: The Girl with the Dragon Tattoo
2009: Girl with a Pearl Earring
2012: Gone Girl
2015: The Girl on the Train
       "
  )


Count of Books on goodreads.com

Goodreads only allows you to scrape the first 100 pages of search results, with each page containing 20 books. Over the years, there has been a drastic increase in the number of books published with both the words ‘Boy’ and ‘Girl’ in the title. With that being said, titles with ‘Girl’ seem to do much better with regard to popularity. The drop in 2016 may be because 2016 books are not yet popular enough to have been read by lots of people and therefore did not show up in the Goodreads results when searching for ‘Boy’ and ‘Girl’.

                                                                          # Count of Books
# Line chart of the number of scraped books per publication year, one line
# per keyword; legend labels and colours come from scale_color_manual().
ggplot(final.df, aes(x = published)) +
  geom_line(aes(y = boy.count, colour = "#00BFC4"), size = 1.0) +  # blue
  geom_line(aes(y = girl.count, colour = "#F8766D"), size = 1.0) +  # red
  scale_color_manual(
    name = "",
    labels = c("Boy(s)", "Girl(s)"),
    values = c("#00BFC4", "#F8766D")
  ) +
  scale_x_continuous(limits = c(1984, 2016), breaks = seq(1984, 2016, 4)) +
  scale_y_continuous(
    limits = c(0, 250),
    breaks = seq(0, 250, 25),
    labels = comma
  ) +
  labs(
    title = 'Total Count of Books Containing "Boy(s)" or "Girl(s)"',
    x = "Year Published",
    y = "Count of Books"
  ) +
  theme(
    axis.line = element_line(size = 1, colour = "black"),
    panel.grid.major = element_line(colour = "#d3d3d3"),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    plot.title = element_text(size = 14, hjust = 0.5, family = "Tahoma", face = "bold"),
    text = element_text(family = "Tahoma"),
    axis.text.x = element_text(colour = "black", size = 10),
    axis.text.y = element_text(colour = "black", size = 10),
    legend.key = element_rect(fill = "white", colour = "white")
  )


Average Rating by Year

With regard to the average rating of these books on Goodreads, it is almost identical: 3.74 for boys and 3.75 for girls. I would expect that the total average rating for all books on Goodreads is also right around 3.75.


# Average Rating
# Overall mean of the per-book average ratings for each keyword, rounded to
# two decimals. paste0() has no `sep` argument, so none is passed.
kable(paste0('Girl: ', round(mean(girl$girl.avg.rating), 2)))
kable(paste0('Boy: ', round(mean(boy$boy.avg.rating), 2)))
Average Rating
Girl: 3.75
Boy: 3.74

# Average Rating by Year
# Line chart of the yearly mean rating, one line per keyword; the y axis is
# limited to the 3-4 range.
ggplot(final.df, aes(x = published)) +
  geom_line(aes(y = boy.avg.rating, colour = "#00BFC4"), size = 1.0) +  # blue
  geom_line(aes(y = girl.avg.rating, colour = "#F8766D"), size = 1.0) +  # red
  scale_color_manual(
    name = "",
    labels = c("Boy(s)", "Girl(s)"),
    values = c("#00BFC4", "#F8766D")
  ) +
  scale_x_continuous(limits = c(1984, 2016), breaks = seq(1984, 2016, 4)) +
  scale_y_continuous(
    limits = c(3, 4),
    breaks = seq(3, 4, .1),
    labels = comma
  ) +
  labs(
    title = 'Average Rating of Books Containing "Boy(s)" or "Girl(s)"',
    x = "Year Published",
    y = "Rating"
  ) +
  theme(
    axis.line = element_line(size = 1, colour = "black"),
    panel.grid.major = element_line(colour = "#d3d3d3"),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    plot.title = element_text(size = 14, hjust = 0.5, family = "Tahoma", face = "bold"),
    text = element_text(family = "Tahoma"),
    axis.text.x = element_text(colour = "black", size = 10),
    axis.text.y = element_text(colour = "black", size = 10),
    legend.key = element_rect(fill = "white", colour = "white")
  )


In conclusion, there have been some extremely popular books lately with ‘Girl’ in the title, but this has not been the case with ‘Boy’. Based on the number of books published with these words, authors seem to be taking a chance with ‘Boy’, because nearly as many of those books are being published as ones with ‘Girl’, but they are not nearly as popular. In any case, ‘Girl’ has prevailed over ‘Boy’.


comments powered by Disqus