forcats: Working with Categorical Data

Introduction


In this module, we will learn to:

  • tabulate levels
  • reorder levels
  • reverse levels
  • collapse levels
  • recode levels
  • recategorize levels
  • shift levels

Case Study


We will use a case study to explore the various features of the forcats package. You can download the data for the case study from the URL shown in the import code below, or import it directly using the readr package. In this case study, we will:

  • compute the frequency of different referrers
  • collapse referrers with low sample size into a single group
  • group traffic from social media websites into a new category
  • group referrers with traffic below a threshold into a single category

Data


# Import the web traffic data and parse the referrer column as a factor.
# The column is addressed by name (`traffics`, the column extracted from this
# tibble later on) rather than by position, so the factor specification cannot
# silently attach to a different column if the file layout changes.
traffic <- read_csv(
  "https://raw.githubusercontent.com/rsquaredacademy/datasets/master/web_traffic.csv",
  col_types = cols(
    traffics = col_factor(
      levels = c(
        "google", "facebook", "affiliates", "bing", "yahoo",
        "twitter", "instagram", "unknown", "direct"
      )
    )
  )
)

# Print the imported tibble.
traffic

Data


# Pull the `traffics` column out of the tibble as a standalone factor.
traffics <- traffic$traffics

# Print the factor.
traffics

Count


# Tabulate how many observations fall into each level.
traffics %>%
  fct_count()

Levels


# List the levels in their current order.
traffics %>%
  levels()

Reorder


Reorder


# Order the levels by frequency (most frequent first) and list them.
levels(fct_infreq(traffics))

Reorder


Reorder


# Order the levels by their first appearance in the data and list them.
levels(fct_inorder(traffics))

Reverse Levels


Reverse Levels


# Reverse the level order and list the result.
levels(fct_rev(traffics))

Collapse Categories


Collapse Categories


# Merge the social media referrers into "social" and the search engines into
# "search", then count the observations per resulting level.
fct_count(
  fct_collapse(
    traffics,
    social = c("facebook", "twitter", "instagram"),
    search = c("google", "bing", "yahoo")
  )
)

Lump Categories


Lump Categories


# Frequencies before lumping, for comparison.
traffics %>%
  fct_count()

# With no arguments, fct_lump() pools the least frequent levels into "Other".
# NOTE(review): newer forcats releases mark fct_lump() as superseded by
# fct_lump_n() / fct_lump_prop() / fct_lump_lowfreq() - consider migrating if
# the installed version allows.
fct_lump(traffics) %>%
  table()

Lump Categories


# Frequencies sorted from most to least common, for comparison.
arrange(fct_count(traffics), desc(n))

# Keep the three most frequent levels and pool the rest into "Other".
fct_lump(traffics, n = 3) %>%
  table()

Lump Categories


Lump Categories


# Frequencies with each level's share of the total, in percent.
mutate(
  fct_count(traffics),
  percent = round(n / sum(n) * 100, 2)
)

# Keep levels whose share exceeds 10% and pool the rest into "Other".
fct_lump(traffics, prop = 0.1) %>%
  table()

Lump Categories


# Frequencies with each level's share of the total, in percent.
mutate(
  fct_count(traffics),
  percent = round(n / sum(n) * 100, 2)
)

# Keep levels whose share exceeds 15% and pool the rest into "Other".
fct_lump(traffics, prop = 0.15) %>%
  table()

Lump Categories


# Frequencies sorted from least to most common, for comparison.
arrange(fct_count(traffics), n)

# A negative n flips the rule: keep the three least frequent levels and pool
# the more frequent ones into "Other".
fct_lump(traffics, n = -3) %>%
  table()

Lump Categories


# Frequencies with each level's share of the total, in percent.
mutate(
  fct_count(traffics),
  percent = round(n / sum(n) * 100, 2)
)

# A negative prop flips the rule: keep levels whose share is at most 10% and
# pool the more frequent ones into "Other".
fct_lump(traffics, prop = -0.1) %>%
  table()

Replace Levels


Replace Levels with Other


# Keep only "google" and "yahoo"; every other level is replaced by "Other".
traffics %>%
  fct_other(keep = c("google", "yahoo")) %>%
  levels()

Replace Levels


Replace Levels with Other


# Replace "instagram" and "twitter" with "Other"; all other levels are kept.
traffics %>%
  fct_other(drop = c("instagram", "twitter")) %>%
  levels()

Recode Levels


Recode Levels


# Rename levels one by one (new = "old"): the search engines become "search"
# and the social media sites become "social".
traffics %>%
  fct_recode(
    search = "bing",
    search = "yahoo",
    search = "google",
    social = "facebook",
    social = "twitter",
    social = "instagram"
  ) %>%
  levels()

Reorder Levels


Reorder Levels


# Move "twitter" to the front of the level order.
traffics %>%
  fct_relevel("twitter") %>%
  levels()

Reorder Levels


Reorder Levels


# Re-insert "google" after the second of the remaining levels, i.e. in the
# third position.
traffics %>%
  fct_relevel("google", after = 2) %>%
  levels()

Reorder Levels


Reorder Levels


# Move "facebook" to the very end of the level order.
traffics %>%
  fct_relevel("facebook", after = Inf) %>%
  levels()

Data


# Import the survey responses and parse the answer column as an ordered factor.
# The column is addressed by name (`response`, the column extracted from this
# tibble later on) rather than by position, so the factor specification cannot
# silently attach to a different column if the file layout changes.
response <- read_csv(
  "https://raw.githubusercontent.com/rsquaredacademy/datasets/master/response.csv",
  col_types = cols(
    response = col_factor(
      levels = c(
        "like", "like somewhat", "neutral", "dislike somewhat", "dislike"
      ),
      ordered = TRUE
    )
  )
)

# Print the imported tibble.
response

Data


# Pull the `response` column out of the tibble as a standalone factor.
responses <- response$response

# Print the factor.
responses

Shift Levels


Shift Levels


# Rotate the levels two places to the left; the first two levels wrap around
# to the end.
responses %>%
  fct_shift(2) %>%
  levels()

Shift Levels


Shift Levels


# Rotate the levels two places to the right; the last two levels wrap around
# to the front.
responses %>%
  fct_shift(-2) %>%
  levels()