1
votes

Organiser et remodeler une trame de données dans R à l'aide de la fonction melt() pour tracer un graphique en courbes

J'ai une trame de données, il y a 4 points temporels (lignes) et 18 gènes (colonnes), et dans l'une des lignes, il y a un identifiant associé à chaque gène qui serait utile lors du tracé d'un graphique linéaire dans ggplot2. J'importe les données, puis j'utilise la fonction melt () pour organiser les données du format large au format long. Ce que j'observe, c'est que les ID s'interrompent au milieu de la trame de données. Je veux qu'ils soient disposés dans la dernière colonne, ce qui serait bénéfique lors du tracé en ligne à l'aide de la bibliothèque ggplot2. Veuillez m'aider avec ceci.

Merci,

Toufiq

Résultat attendu (format long, avec les ID dans la dernière colonne)

dput((B1_Test_v1))
structure(list(Timepoints = c(1L, 2L, 3L, 5L, 1L, 2L, 3L, 5L, 
1L, 2L, 3L, 5L, 1L, 2L, 3L, 5L, 1L, 2L, 3L, 5L, 1L, 2L, 3L, 5L
), Genes = c("Gene_A", "Gene_A", "Gene_A", "Gene_A", "Gene_B", 
"Gene_B", "Gene_B", "Gene_B", "Gene_C", "Gene_C", "Gene_C", "Gene_C", 
"Gene_D", "Gene_D", "Gene_D", "Gene_D", "Gene_E", "Gene_E", "Gene_E", 
"Gene_E", "Gene_F", "Gene_F", "Gene_F", "Gene_F"), value = c(-2.05066, 
-0.657222, -1.49477, -1.80191, -8.35787, -9.52402, -10.6604, 
-10.516, -2.06287, -0.846725, -1.63796, -1.31922, -3.83545, -1.19723, 
-1.53115, -3.25903, -6.59039, -5.98822, -6.23785, -5.00584, -5.02469, 
-4.41637, -5.46219, -3.97594), ID = c("A1.1", "A1.1", "A1.1", 
"A1.1", "A1.2", "A1.2", "A1.2", "A1.2", "A1.3", "A1.3", "A1.3", 
"A1.3", "A1.4", "A1.4", "A1.4", "A1.4", "A1.5", "A1.5", "A1.5", 
"A1.5", "A1.1", "A1.1", "A1.1", "A1.1")), class = "data.frame", row.names = c(NA, 
-24L))

Utilisation de melt() pour organiser les données du format large au format long

# Load reshape2 for melt(). library() stops with an error when the package
# is missing, whereas require() only returns FALSE and lets the script go on.
library(reshape2)

# Reshape from wide to long using every row of B1_Test, keyed on Timepoints.
# NOTE: the "ID" row is melted along with the measurements, which is why the
# IDs end up interleaved with the numbers in the `value` column.
B1_Test_melt <- melt(B1_Test, id.vars = "Timepoints", variable.name = "Genes")

dput((B1_Test_melt))
structure(list(Timepoints = c("1", "2", "3", "5", "ID", "1", 
"2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", 
"5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", 
"1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", 
"3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", 
"ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", 
"2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", 
"5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID"
), Genes = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 
3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 6L, 
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 
9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 
12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 13L, 14L, 14L, 14L, 
14L, 14L, 15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L, 16L, 17L, 
17L, 17L, 17L, 17L, 18L, 18L, 18L, 18L, 18L), .Label = c("Gene_A", 
"Gene_B", "Gene_C", "Gene_D", "Gene_E", "Gene_F", "Gene_G", "Gene_H", 
"Gene_I", "Gene_K", "Gene_L", "Gene_M", "Gene_N", "Gene_O", "Gene_P", 
"Gene_R", "Gene_S", "Gene_T"), class = "factor"), value = c("-2.05066", 
"-0.657222", "-1.49477", "-1.80191", "A1.1", "-8.35787", "-9.52402", 
"-10.6604", "-10.516", "A1.2", "-2.06287", "-0.846725", "-1.63796", 
"-1.31922", "A1.3", "-3.83545", "-1.19723", "-1.53115", "-3.25903", 
"A1.4", "-6.59039", "-5.98822", "-6.23785", "-5.00584", "A1.5", 
"-5.02469", "-4.41637", "-5.46219", "-3.97594", "A1.1", "-7.75424", 
"-8.17158", "-7.90569", "-8.01352", "A1.6", "-4.65703", "-3.42328", 
"-4.08867", "-3.76642", "A1.2", "-11.7749", "-11.649", "-11.3751", 
"-10.3728", "A1.3", "-4.08981", "-3.09873", "-3.95986", "-3.97249", 
"A1.4", NA, "-19.7923", NA, "-15.1216", "A1.5", "-4.11469", "-3.19647", 
"-3.99615", "-3.06183", "A1.6", "-6.53017", "-6.16685", "-6.865", 
"-6.44303", "A1.9", "-4.58034", "-3.45153", "-4.86697", "-5.25414", 
"A2.2", "-3.45614", "-2.72413", "-2.75492", "-2.76479", "A2.2", 
"-5.24809", "-4.15782", "-5.28192", "-5.72024", "A2.6", "-7.73098", 
"-7.20226", "-8.04388", "-7.68191", "A2.6", "-5.09079", "-4.52039", 
"-4.75427", "-5.4321", "A1.9")), row.names = c(NA, -90L), class = "data.frame")

Importation de données

# Read the wide-format CSV from the working directory; text columns stay
# character vectors instead of being converted to factors.
B1_Test <- read.csv(
  file = "./B1_Test.csv",
  stringsAsFactors = FALSE
)

dput(head(B1_Test))
structure(list(Timepoints = c("1", "2", "3", "5", "ID"), Gene_A = c("-2.05066", 
"-0.657222", "-1.49477", "-1.80191", "A1.1"), Gene_B = c("-8.35787", 
"-9.52402", "-10.6604", "-10.516", "A1.2"), Gene_C = c("-2.06287", 
"-0.846725", "-1.63796", "-1.31922", "A1.3"), Gene_D = c("-3.83545", 
"-1.19723", "-1.53115", "-3.25903", "A1.4"), Gene_E = c("-6.59039", 
"-5.98822", "-6.23785", "-5.00584", "A1.5"), Gene_F = c("-5.02469", 
"-4.41637", "-5.46219", "-3.97594", "A1.1"), Gene_G = c("-7.75424", 
"-8.17158", "-7.90569", "-8.01352", "A1.6"), Gene_H = c("-4.65703", 
"-3.42328", "-4.08867", "-3.76642", "A1.2"), Gene_I = c("-11.7749", 
"-11.649", "-11.3751", "-10.3728", "A1.3"), Gene_K = c("-4.08981", 
"-3.09873", "-3.95986", "-3.97249", "A1.4"), Gene_L = c(NA, "-19.7923", 
NA, "-15.1216", "A1.5"), Gene_M = c("-4.11469", "-3.19647", "-3.99615", 
"-3.06183", "A1.6"), Gene_N = c("-6.53017", "-6.16685", "-6.865", 
"-6.44303", "A1.9"), Gene_O = c("-4.58034", "-3.45153", "-4.86697", 
"-5.25414", "A2.2"), Gene_P = c("-3.45614", "-2.72413", "-2.75492", 
"-2.76479", "A2.2"), Gene_R = c("-5.24809", "-4.15782", "-5.28192", 
"-5.72024", "A2.6"), Gene_S = c("-7.73098", "-7.20226", "-8.04388", 
"-7.68191", "A2.6"), Gene_T = c("-5.09079", "-4.52039", "-4.75427", 
"-5.4321", "A1.9")), row.names = c(NA, 5L), class = "data.frame")


0 commentaires

3 Réponses :


1
votes

Ces données sont mal formatées dans le document source : la présence de la ligne ID dans les données force la conversion de toutes les colonnes numériques en chaînes de caractères. Vous devriez d'abord convaincre la personne qui vous fournit ce fichier de livrer un jeu de données raisonnable (des colonnes de classes mixtes ne sont pas raisonnables, à mon avis).

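En l'absence de cela, extrayez la ligne ID et remodelez-la, remodelez séparément les lignes restantes, puis fusionnez les deux résultats. La fusion finale (qui utilise B1_Test_melt et B1_IDs, construits dans les deux extraits suivants) donne :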

# Attach the per-gene ID to every long-format row by joining on Genes.
# Both inputs must already exist: B1_Test_melt (melted non-ID rows) and
# B1_IDs (Genes/ID lookup) are built in the snippets further down.
B1_merged <- merge(
  x = B1_Test_melt,
  y = B1_IDs,
  by = "Genes",
  all = TRUE
)
head(B1_merged)
#    Genes Timepoints     value   ID
# 1 Gene_A          1  -2.05066 A1.1
# 2 Gene_A          2 -0.657222 A1.1
# 3 Gene_A          3  -1.49477 A1.1
# 4 Gene_A          5  -1.80191 A1.1
# 5 Gene_B          1  -8.35787 A1.2
# 6 Gene_B          2  -9.52402 A1.2

Maintenant, faites votre remodelage sur les lignes non- ID :

# Melt only the measurement rows: the logical mask drops the row whose
# Timepoints entry is "ID" before reshaping to long format.
B1_Test_melt <- melt(
  B1_Test[B1_Test$Timepoints != "ID", ],
  id.vars = "Timepoints",
  variable.name = "Genes"
)
B1_Test_melt
# *** output flushed ***
head(B1_Test_melt)
#   Timepoints  Genes     value
# 1          1 Gene_A  -2.05066
# 2          2 Gene_A -0.657222
# 3          3 Gene_A  -1.49477
# 4          5 Gene_A  -1.80191
# 5          1 Gene_B  -8.35787
# 6          2 Gene_B  -9.52402

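Extrayez également la ligne ID sous forme de table de correspondance Genes/ID (B1_IDs, utilisée par la fusion merge() présentée plus haut) :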

# Melt just the "ID" row into a Genes/ID lookup table; the melted value
# column is named ID and only the Genes and ID columns are kept.
B1_IDs <- melt(
  B1_Test[B1_Test$Timepoints == "ID", ],
  id.vars = "Timepoints",
  variable.name = "Genes",
  value.name = "ID"
)[, c("Genes", "ID")]
head(B1_IDs)
#    Genes   ID
# 1 Gene_A A1.1
# 2 Gene_B A1.2
# 3 Gene_C A1.3
# 4 Gene_D A1.4
# 5 Gene_E A1.5
# 6 Gene_F A1.1

(Sauf erreur de ma part, vous voudrez probablement aussi convertir la colonne des valeurs en numérique : B1_merged$value <- as.numeric(B1_merged$value). Notez également que Genes est un facteur, que l'on peut convertir à l'aide de as.character() si nécessaire.)


0 commentaires

1
votes

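La première chose à faire serait de séparer les ID des données (objet Gene_ID, construit dans le dernier extrait ci-dessous). Une fois Gene_vals et Gene_ID créés, la fusion finale donne :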

# Console transcript: join the melted values with the IDs by matching the
# Genes column of Gene_vals against the row names of Gene_ID. Gene_vals and
# Gene_ID are created in the transcripts shown further down; the ID column
# keeps the name X5.
> final <- merge(Gene_vals, Gene_ID, by.x="Genes", by.y="row.names")
> head(final)
   Genes Timepoints     value   X5
1 Gene_A          1 -2.050660 A1.1
2 Gene_A          2 -0.657222 A1.1
3 Gene_A          3 -1.494770 A1.1
4 Gene_A          5 -1.801910 A1.1
5 Gene_B          1 -8.357870 A1.2
6 Gene_B          2 -9.524020 A1.2

Puis fondre les lignes non-ID:

# Console transcript: melt every row of B1_Test except row 5 (the ID row in
# this data), then convert the character `value` column to numeric; the two
# str() calls show `value` going from chr to num.
> Gene_vals <- melt( B1_Test[-5,],  id.vars = 'Timepoints', variable.name = 'Genes')
> head(Gene_vals)
  Timepoints  Genes     value
1          1 Gene_A  -2.05066
2          2 Gene_A -0.657222
3          3 Gene_A  -1.49477
4          5 Gene_A  -1.80191
5          1 Gene_B  -8.35787
6          2 Gene_B  -9.52402
> str(Gene_vals)
'data.frame':   72 obs. of  3 variables:
 $ Timepoints: chr  "1" "2" "3" "5" ...
 $ Genes     : Factor w/ 18 levels "Gene_A","Gene_B",..: 1 1 1 1 2 2 2 2 3 3 ...
 $ value     : chr  "-2.05066" "-0.657222" "-1.49477" "-1.80191" ...
> Gene_vals$value <- as.numeric(Gene_vals$value)
> str(Gene_vals)
'data.frame':   72 obs. of  3 variables:
 $ Timepoints: chr  "1" "2" "3" "5" ...
 $ Genes     : Factor w/ 18 levels "Gene_A","Gene_B",..: 1 1 1 1 2 2 2 2 3 3 ...
 $ value     : num  -2.051 -0.657 -1.495 -1.802 -8.358 ...

Extraction des ID (étape à effectuer avant la fusion présentée plus haut) :

# Console transcript: transpose row 5 of B1_Test (without its first column)
# so that each gene becomes a row; the gene names become the row names and
# the single resulting column is named X5.
Gene_ID <- data.frame( t( B1_Test[5,-1]))
> Gene_ID
         X5
Gene_A A1.1
Gene_B A1.2
Gene_C A1.3
Gene_D A1.4
Gene_E A1.5
snip....


1 commentaires

Merci à tous pour les suggestions utiles. En suivant les suggestions ci-dessus, le problème est désormais résolu.



2
votes

Solution de base R — données d'exemple :

# Example input in wide format: a Timepoints column plus one column per gene.
# The fifth row (Timepoints == "ID") holds each gene's identifier, so every
# column, including the measurements, is stored as character strings.
df <-
  structure(
    list(
      Timepoints = c("1", "2", "3", "5", "ID"),
      Gene_A = c("-2.05066",
                 "-0.657222", "-1.49477", "-1.80191", "A1.1"),
      Gene_B = c("-8.35787",
                 "-9.52402", "-10.6604", "-10.516", "A1.2"),
      Gene_C = c("-2.06287",
                 "-0.846725", "-1.63796", "-1.31922", "A1.3"),
      Gene_D = c("-3.83545",
                 "-1.19723", "-1.53115", "-3.25903", "A1.4"),
      Gene_E = c("-6.59039",
                 "-5.98822", "-6.23785", "-5.00584", "A1.5"),
      Gene_F = c("-5.02469",
                 "-4.41637", "-5.46219", "-3.97594", "A1.1"),
      Gene_G = c("-7.75424",
                 "-8.17158", "-7.90569", "-8.01352", "A1.6"),
      Gene_H = c("-4.65703",
                 "-3.42328", "-4.08867", "-3.76642", "A1.2"),
      Gene_I = c("-11.7749",
                 "-11.649", "-11.3751", "-10.3728", "A1.3"),
      Gene_K = c("-4.08981",
                 "-3.09873", "-3.95986", "-3.97249", "A1.4"),
      Gene_L = c(NA, "-19.7923",
                 NA, "-15.1216", "A1.5"),
      Gene_M = c("-4.11469", "-3.19647", "-3.99615",
                 "-3.06183", "A1.6"),
      Gene_N = c("-6.53017", "-6.16685", "-6.865",
                 "-6.44303", "A1.9"),
      Gene_O = c("-4.58034", "-3.45153", "-4.86697",
                 "-5.25414", "A2.2"),
      Gene_P = c("-3.45614", "-2.72413", "-2.75492",
                 "-2.76479", "A2.2"),
      Gene_R = c("-5.24809", "-4.15782", "-5.28192",
                 "-5.72024", "A2.6"),
      Gene_S = c("-7.73098", "-7.20226", "-8.04388",
                 "-7.68191", "A2.6"),
      Gene_T = c("-5.09079", "-4.52039", "-4.75427",
                 "-5.4321", "A1.9")
    ),
    row.names = c(NA, 5L),
    class = "data.frame"
  )

Solution (base R) :

# Create a data frame comprised of the gene names and their IDs, taken from
# the last row of df (the "ID" row). Gene columns are selected by name rather
# than by position, so the result does not rely on Timepoints being the
# first column, and FALSE is spelled out because F can be reassigned.
id_row <- df[nrow(df), names(df) != "Timepoints", drop = FALSE]

ID <- data.frame(
  Genes = names(id_row),
  ID = as.character(unlist(id_row, use.names = FALSE)),
  stringsAsFactors = FALSE
)

# Melt the original dataframe (less the ID row) into long format.
# The last row is dropped with a negative index: 1:(nrow(df) - 1) counts
# backwards (1, 0) when df has a single row and would then keep the very
# row it was meant to remove.
gene_cols <- names(df)[names(df) != "Timepoints"]

df_long <- data.frame(
  reshape(
    df[-nrow(df), ],
    direction = "long",
    varying = gene_cols,
    v.names = "value",
    times = gene_cols,
    timevar = "Genes"
  ),
  row.names = NULL
)

# Left join: keep every row of the long data frame and attach the matching
# ID from the ID lookup table, joining on Genes. TRUE is spelled out because
# T is an ordinary variable that can be reassigned.
df_long <- merge(df_long, ID, by = "Genes", all.x = TRUE)


0 commentaires