2

So I have two dataframes containing ";" separated IDs. I want to see which IDs in df2$Protein.IDs are in df1$subunits.UniProt.IDs., and ideally return the matching IDs as a list. I'm not sure how to approach this as I haven't come across anything in previous questions. I'm trying to use strsplit separate the IDs for each entry, then search for matches for each one, with the goal of iterating across the df by a for loop, but I'm a bit stuck on how to make it come together.

df1 <- structure(list(ComplexName = c("Oligosaccharyltransferase complex (Stt3B variant)", 
"TRF1-TIN2 complex", "TRF1 telomere length regulation complex", 
"Rap1 complex", "TRF2-Rap1 complex III", "TRF-Rap1 complex I, 2MD"
), Organism = c("Mouse", "Mouse", "Mouse", "Mouse", "Mouse", 
"Mouse"), subunits.UniProt.IDs. = c("Q91YQ5;Q9DBG6;O54734;P61804;Q8BTV1;Q3TDQ1;Q9CQY5", 
"P70371;Q5EE38;Q9QXG9;Q91WC1", "Q6PFX9;P70371;Q9QXG9", "P70371;O35144;P70388;Q5EE38;Q9QXG9;Q91WC1;Q91VL8", 
"O35144;Q91VL8", "Q6PFX9;P70371;O35144;Q9QXG9;Q91WC1;Q91VL8")), row.names = 881:886, class = "data.frame")
df2 <- structure(list(Protein.IDs = c("E9QN37;A1L314", "A2A6U3;Q80UG5-2;Q80UG5-3;Q80UG5;A2A6U5;A8Y5D3", 
"A2A7A7;Q8CFX1;A2A7A8", "A6PW84;A2A7Q5;Q3V1T4-3;Q3V1T4", "A2ACG7;Q9DBG6", 
"F6TBV1;F6WHL0;A2ADH1;Q9CQY5-2;Q9CQY5-3;Q9CQY5", "A2AEM2;Q99LI2", 
"A2AFS0;Q8C483;P26638;A2AFS1", "A2AFW6;Q9D050;Q791V5;D6RCZ1", 
"Q3UB58;Q3UNJ0;A2AIN5;Q3UZ35;P21855", "A2AIW9;Q9DC61", "A2AL36;E9Q9E7;A2AS42;F6QP53;A2AL36-2", 
"A2AL50;Q8C0I1;H3BKN2;A2AL49;H3BIY5", "Q8CBM2;A2AL85;Q8BSY0", 
"A2AMH3;A2AMH5;A2AMH4;Q6X893;Q6X893-2;A2AMH8", "A2AMW0;P47757-2;A2AMV7;P47757;F6QJN8;F6YHZ8;F7CAZ6", 
"Q3U8S1;A2APM5;A2APM3;A2APM4;E9QKM8;Q80X37;A2APM1;A2APM2;P15379-2;P15379-3;P15379-6;P15379-11;P15379-5;P15379-10;P15379-9;P15379-4;P15379-8;P15379-7;P15379;P15379-12;P15379-13", 
"A2AR26;Q3UDF0;A2AR27", "E9Q8N1;E9Q8K5;A2ASS6;A2ASS6-2;A2AT70;F7CR78", 
"A2AUR7;Q9D031;Q01730"), Gene.names = c("Mpeg1", "Sep-09", "H6pd", 
"P3h1;Lepre1", "Rpn2", "Magt1", "Clcc1", "Sars", "Mtch2", "Cd72", 
"Pmpca", "Cntrl", "Agps", "Asph", "Slc44a1", "Capzb", "Cd44", 
"Slc2a6", "Ttn", "Rsu1")), row.names = c(NA, 20L), class = "data.frame")

Thanks!

0

2 Answers 2

4

Assuming, the column is added in 'df2', we can use strsplit from base R to split both the relevant columns in the datasets, use intersect to find the common elements in each element of the list

un1 <- unique(unlist(strsplit(df1$subunits.UniProt.IDs., ";")))
df2$Common <- lapply(strsplit(df2$Protein.IDs, ";"), intersect, un1)
0
4

You could use dplyr, tidyr and stringr (or simply tidyverse):

library(dplyr)
library(tidyr)
library(stringr)

df2 %>% 
  mutate(Protein = str_split(Protein.IDs, ";")) %>% 
  unnest(Protein) %>% 
  inner_join(df1 %>% 
              mutate(Protein = str_split(subunits.UniProt.IDs., ";")) %>% 
              unnest(Protein), by="Protein")

which returns

# A tibble: 2 x 6
  Protein.IDs                Gene.names Protein ComplexName                Organism subunits.UniProt.IDs.        
  <chr>                      <chr>      <chr>   <chr>                      <chr>    <chr>                        
1 A2ACG7;Q9DBG6              Rpn2       Q9DBG6  Oligosaccharyltransferase~ Mouse    Q91YQ5;Q9DBG6;O54734;P61804;~
2 F6TBV1;F6WHL0;A2ADH1;Q9CQ~ Magt1      Q9CQY5  Oligosaccharyltransferase~ Mouse    Q91YQ5;Q9DBG6;O54734;P61804;~

We splitted the Protein.IDs / subunits.UniProt.IDs. at ; and inner joined both tables by the resulting column.

Since both transformations for df1 and df2 are basically the same, you can wrap it into a function. This is useful, if you have to process several more data.frames:

split_function <- function(df, column, new_col) {
  df %>% 
    mutate({{new_col}} := str_split({{column}}, ";")) %>% 
    unnest({{new_col}})
}

inner_join(split_function(df1, subunits.UniProt.IDs., Protein), 
           split_function(df2, Protein.IDs, Protein),
           by = "Protein")

Not the answer you're looking for? Browse other questions tagged or ask your own question.