# merge on factor column

library("forcats")
library("data.table")
library("magrittr")

I recently started working with factors in R and found the forcats package amazing. However, I did not found much documentation on merging two data frames on a factor column so I made small examples and observed the behavior.

## same levels, different order

In abc order: natural if converting from already ordered character.

dt_abc <- data.table(size = as_factor(c("big", "medium", "tiny")))
levels(dt_abc$size) ##  "big" "medium" "tiny" We can define the natural order: dt_lg <- data.table(size = fct_reorder(as_factor(c("big", "medium", "tiny")), c(3, 2, 1))) levels(dt_lg$size)
##  "tiny"   "medium" "big"
merge(dt_abc, dt_lg, by = "size") %T>%
{print(levels(.$size))} ##  "big" "medium" "tiny" ## size ## 1: big ## 2: medium ## 3: tiny merge(dt_lg, dt_abc, by = "size") %T>% {print(levels(.$size))}
##  "tiny"   "medium" "big"
##      size
## 1:   tiny
## 2: medium
## 3:    big

From this it seems that with merge the left table has precedence.

## different levels

dt_abc <- data.table(size = as_factor(c("big", "medium", "small", "tiny")))
dt_lg <- data.table(size = fct_reorder(as_factor(c("big", "medium", "tiny")), c(3, 2, 1)))
merge(dt_abc, dt_lg, by = "size", all = TRUE) %T>%
{print(levels(.$size))} ##  "big" "medium" "small" "tiny" ## size ## 1: big ## 2: medium ## 3: small ## 4: tiny merge(dt_lg, dt_abc, by = "size", all = TRUE) %T>% {print(levels(.$size))}
##  "tiny"   "medium" "big"    "small"
##      size
## 1:   tiny
## 2: medium
## 3:    big
## 4:  small

In this case the levels of the factors in the two table are concatenated: first left then right.

## merge factor with character

dt_chr <- data.table(size = c("big", "medium", "tiny"))
dt_fct <- data.table(size = fct_reorder(as_factor(c("big", "medium", "tiny")), c(3, 2, 1)))
merge(dt_fct, dt_chr, by = "size") %T>%
{print(levels(.$size))} ## NULL ## size ## 1: big ## 2: medium ## 3: tiny merge(dt_chr, dt_fct, by = "size") %T>% {print(levels(.$size))}
## NULL
##      size
## 1:    big
## 2: medium
## 3:   tiny

From this it seems that in this case the right side of the join has precedence.