J'ai un data frame (tableau de données) contenant 4 points temporels (lignes) et 18 gènes (colonnes) ; l'une des lignes contient un identifiant associé à chaque gène, qui serait utile pour tracer un graphique linéaire avec ggplot2. J'importe les données, puis j'utilise la fonction melt() pour passer du format large au format long. Ce que j'observe, c'est que les ID se retrouvent au milieu du data frame, mélangés aux valeurs. Je voudrais qu'ils soient placés dans une dernière colonne séparée, ce qui faciliterait le tracé des lignes avec la bibliothèque ggplot2. Merci de m'aider sur ce point.
Merci,
Toufiq
Importation de données
dput((B1_Test_v1)) structure(list(Timepoints = c(1L, 2L, 3L, 5L, 1L, 2L, 3L, 5L, 1L, 2L, 3L, 5L, 1L, 2L, 3L, 5L, 1L, 2L, 3L, 5L, 1L, 2L, 3L, 5L ), Genes = c("Gene_A", "Gene_A", "Gene_A", "Gene_A", "Gene_B", "Gene_B", "Gene_B", "Gene_B", "Gene_C", "Gene_C", "Gene_C", "Gene_C", "Gene_D", "Gene_D", "Gene_D", "Gene_D", "Gene_E", "Gene_E", "Gene_E", "Gene_E", "Gene_F", "Gene_F", "Gene_F", "Gene_F"), value = c(-2.05066, -0.657222, -1.49477, -1.80191, -8.35787, -9.52402, -10.6604, -10.516, -2.06287, -0.846725, -1.63796, -1.31922, -3.83545, -1.19723, -1.53115, -3.25903, -6.59039, -5.98822, -6.23785, -5.00584, -5.02469, -4.41637, -5.46219, -3.97594), ID = c("A1.1", "A1.1", "A1.1", "A1.1", "A1.2", "A1.2", "A1.2", "A1.2", "A1.3", "A1.3", "A1.3", "A1.3", "A1.4", "A1.4", "A1.4", "A1.4", "A1.5", "A1.5", "A1.5", "A1.5", "A1.1", "A1.1", "A1.1", "A1.1")), class = "data.frame", row.names = c(NA, -24L))
Utilisation de melt() pour organiser les données du format large au format long
require(reshape2) B1_Test_melt <- melt(B1_Test , id.vars = 'Timepoints', variable.name = 'Genes') dput((B1_Test_melt)) structure(list(Timepoints = c("1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID" ), Genes = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 13L, 14L, 14L, 14L, 14L, 14L, 15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L, 16L, 17L, 17L, 17L, 17L, 17L, 18L, 18L, 18L, 18L, 18L), .Label = c("Gene_A", "Gene_B", "Gene_C", "Gene_D", "Gene_E", "Gene_F", "Gene_G", "Gene_H", "Gene_I", "Gene_K", "Gene_L", "Gene_M", "Gene_N", "Gene_O", "Gene_P", "Gene_R", "Gene_S", "Gene_T"), class = "factor"), value = c("-2.05066", "-0.657222", "-1.49477", "-1.80191", "A1.1", "-8.35787", "-9.52402", "-10.6604", "-10.516", "A1.2", "-2.06287", "-0.846725", "-1.63796", "-1.31922", "A1.3", "-3.83545", "-1.19723", "-1.53115", "-3.25903", "A1.4", "-6.59039", "-5.98822", "-6.23785", "-5.00584", "A1.5", "-5.02469", "-4.41637", "-5.46219", "-3.97594", "A1.1", "-7.75424", "-8.17158", "-7.90569", "-8.01352", "A1.6", "-4.65703", "-3.42328", "-4.08867", "-3.76642", "A1.2", "-11.7749", "-11.649", "-11.3751", "-10.3728", "A1.3", "-4.08981", "-3.09873", "-3.95986", "-3.97249", "A1.4", NA, "-19.7923", NA, "-15.1216", "A1.5", "-4.11469", "-3.19647", "-3.99615", "-3.06183", "A1.6", "-6.53017", "-6.16685", "-6.865", "-6.44303", "A1.9", "-4.58034", "-3.45153", 
"-4.86697", "-5.25414", "A2.2", "-3.45614", "-2.72413", "-2.75492", "-2.76479", "A2.2", "-5.24809", "-4.15782", "-5.28192", "-5.72024", "A2.6", "-7.73098", "-7.20226", "-8.04388", "-7.68191", "A2.6", "-5.09079", "-4.52039", "-4.75427", "-5.4321", "A1.9")), row.names = c(NA, -90L), class = "data.frame")
Résultat attendu
B1_Test <- read.csv(file ="./B1_Test.csv", stringsAsFactors = FALSE) dput(head(B1_Test)) structure(list(Timepoints = c("1", "2", "3", "5", "ID"), Gene_A = c("-2.05066", "-0.657222", "-1.49477", "-1.80191", "A1.1"), Gene_B = c("-8.35787", "-9.52402", "-10.6604", "-10.516", "A1.2"), Gene_C = c("-2.06287", "-0.846725", "-1.63796", "-1.31922", "A1.3"), Gene_D = c("-3.83545", "-1.19723", "-1.53115", "-3.25903", "A1.4"), Gene_E = c("-6.59039", "-5.98822", "-6.23785", "-5.00584", "A1.5"), Gene_F = c("-5.02469", "-4.41637", "-5.46219", "-3.97594", "A1.1"), Gene_G = c("-7.75424", "-8.17158", "-7.90569", "-8.01352", "A1.6"), Gene_H = c("-4.65703", "-3.42328", "-4.08867", "-3.76642", "A1.2"), Gene_I = c("-11.7749", "-11.649", "-11.3751", "-10.3728", "A1.3"), Gene_K = c("-4.08981", "-3.09873", "-3.95986", "-3.97249", "A1.4"), Gene_L = c(NA, "-19.7923", NA, "-15.1216", "A1.5"), Gene_M = c("-4.11469", "-3.19647", "-3.99615", "-3.06183", "A1.6"), Gene_N = c("-6.53017", "-6.16685", "-6.865", "-6.44303", "A1.9"), Gene_O = c("-4.58034", "-3.45153", "-4.86697", "-5.25414", "A2.2"), Gene_P = c("-3.45614", "-2.72413", "-2.75492", "-2.76479", "A2.2"), Gene_R = c("-5.24809", "-4.15782", "-5.28192", "-5.72024", "A2.6"), Gene_S = c("-7.73098", "-7.20226", "-8.04388", "-7.68191", "A2.6"), Gene_T = c("-5.09079", "-4.52039", "-4.75427", "-5.4321", "A1.9")), row.names = c(NA, 5L), class = "data.frame")
3 Réponses :
Ces données sont mal formatées dans le document source : la présence de la ligne ID dans les données convertit toutes les colonnes numériques en chaînes de caractères. Vous devriez d'abord convaincre la personne qui vous fournit ce fichier de données de livrer un jeu de données raisonnable (des colonnes de classes mixtes ne sont pas raisonnables, à mon avis).
À défaut, extrayez la ligne ID, remodelez-la, puis fusionnez ces données avec le reste des données remodelées.
# Join the long-format measurements with the per-gene IDs on the shared
# "Genes" column. Both inputs must already exist before this runs:
#   - B1_Test_melt: melt() of the non-ID rows of B1_Test
#   - B1_IDs:       melt() of the ID row of B1_Test (columns Genes, ID)
# all = TRUE requests a full outer join, so a gene present in only one of the
# two tables is kept (with NA in the columns coming from the other table).
B1_merged <- merge(B1_Test_melt, B1_IDs, by = "Genes", all = TRUE)
head(B1_merged)
#    Genes Timepoints     value   ID
# 1 Gene_A          1  -2.05066 A1.1
# 2 Gene_A          2 -0.657222 A1.1
# 3 Gene_A          3  -1.49477 A1.1
# 4 Gene_A          5  -1.80191 A1.1
# 5 Gene_B          1  -8.35787 A1.2
# 6 Gene_B          2  -9.52402 A1.2
Maintenant, faites votre remodelage sur les lignes non-ID :
# Reshape only the measurement rows (every row whose Timepoints is not "ID")
# from wide to long: one row per (Timepoints, Genes) pair, with the measurement
# in `value`. melt() comes from reshape2, which must already be loaded.
B1_Test_melt <- melt(
  B1_Test[B1_Test$Timepoints != "ID", ],
  id.vars = "Timepoints",
  variable.name = "Genes"
)
B1_Test_melt
# *** output flushed ***
head(B1_Test_melt)
#   Timepoints  Genes     value
# 1          1 Gene_A  -2.05066
# 2          2 Gene_A -0.657222
# 3          3 Gene_A  -1.49477
# 4          5 Gene_A  -1.80191
# 5          1 Gene_B  -8.35787
# 6          2 Gene_B  -9.52402
Et fusionnez les deux ensemble:
# Build a gene -> ID lookup table from the single row whose Timepoints is "ID".
# Melting that row yields one row per gene; value.name = "ID" names the melted
# column, and the trailing subset drops the now-constant Timepoints column.
# melt() comes from reshape2, which must already be loaded.
B1_IDs <- melt(
  B1_Test[B1_Test$Timepoints == "ID", ],
  id.vars = "Timepoints",
  variable.name = "Genes",
  value.name = "ID"
)[, c("Genes", "ID")]
head(B1_IDs)
#    Genes   ID
# 1 Gene_A A1.1
# 2 Gene_B A1.2
# 3 Gene_C A1.3
# 4 Gene_D A1.4
# 5 Gene_E A1.5
# 6 Gene_F A1.1
(Sauf erreur de ma part, vous voudrez probablement aussi convertir les valeurs en numérique : B1_merged$value <- as.numeric(B1_merged$value). Notez également que Genes est un facteur, que l'on peut convertir à l'aide de as.character si nécessaire.)
La première chose à faire serait de séparer les ID des données:
> final <- merge(Gene_vals, Gene_ID, by.x="Genes", by.y="row.names") > head(final) Genes Timepoints value X5 1 Gene_A 1 -2.050660 A1.1 2 Gene_A 2 -0.657222 A1.1 3 Gene_A 3 -1.494770 A1.1 4 Gene_A 5 -1.801910 A1.1 5 Gene_B 1 -8.357870 A1.2 6 Gene_B 2 -9.524020 A1.2
Puis passer les lignes non-ID au format long avec melt() :
> Gene_vals <- melt( B1_Test[-5,], id.vars = 'Timepoints', variable.name = 'Genes') > head(Gene_vals) Timepoints Genes value 1 1 Gene_A -2.05066 2 2 Gene_A -0.657222 3 3 Gene_A -1.49477 4 5 Gene_A -1.80191 5 1 Gene_B -8.35787 6 2 Gene_B -9.52402 > str(Gene_vals) 'data.frame': 72 obs. of 3 variables: $ Timepoints: chr "1" "2" "3" "5" ... $ Genes : Factor w/ 18 levels "Gene_A","Gene_B",..: 1 1 1 1 2 2 2 2 3 3 ... $ value : chr "-2.05066" "-0.657222" "-1.49477" "-1.80191" ... > Gene_vals$value <- as.numeric(Gene_vals$value) > str(Gene_vals) 'data.frame': 72 obs. of 3 variables: $ Timepoints: chr "1" "2" "3" "5" ... $ Genes : Factor w/ 18 levels "Gene_A","Gene_B",..: 1 1 1 1 2 2 2 2 3 3 ... $ value : num -2.051 -0.657 -1.495 -1.802 -8.358 ...
Et les fusionner:
Gene_ID <- data.frame( t( B1_Test[5,-1])) > Gene_ID X5 Gene_A A1.1 Gene_B A1.2 Gene_C A1.3 Gene_D A1.4 Gene_E A1.5 snip....
Merci à tous pour les suggestions utiles. En suivant les suggestions ci-dessus, le problème est désormais résolu.
Solution de base R:
df <- structure( list( Timepoints = c("1", "2", "3", "5", "ID"), Gene_A = c("-2.05066", "-0.657222", "-1.49477", "-1.80191", "A1.1"), Gene_B = c("-8.35787", "-9.52402", "-10.6604", "-10.516", "A1.2"), Gene_C = c("-2.06287", "-0.846725", "-1.63796", "-1.31922", "A1.3"), Gene_D = c("-3.83545", "-1.19723", "-1.53115", "-3.25903", "A1.4"), Gene_E = c("-6.59039", "-5.98822", "-6.23785", "-5.00584", "A1.5"), Gene_F = c("-5.02469", "-4.41637", "-5.46219", "-3.97594", "A1.1"), Gene_G = c("-7.75424", "-8.17158", "-7.90569", "-8.01352", "A1.6"), Gene_H = c("-4.65703", "-3.42328", "-4.08867", "-3.76642", "A1.2"), Gene_I = c("-11.7749", "-11.649", "-11.3751", "-10.3728", "A1.3"), Gene_K = c("-4.08981", "-3.09873", "-3.95986", "-3.97249", "A1.4"), Gene_L = c(NA, "-19.7923", NA, "-15.1216", "A1.5"), Gene_M = c("-4.11469", "-3.19647", "-3.99615", "-3.06183", "A1.6"), Gene_N = c("-6.53017", "-6.16685", "-6.865", "-6.44303", "A1.9"), Gene_O = c("-4.58034", "-3.45153", "-4.86697", "-5.25414", "A2.2"), Gene_P = c("-3.45614", "-2.72413", "-2.75492", "-2.76479", "A2.2"), Gene_R = c("-5.24809", "-4.15782", "-5.28192", "-5.72024", "A2.6"), Gene_S = c("-7.73098", "-7.20226", "-8.04388", "-7.68191", "A2.6"), Gene_T = c("-5.09079", "-4.52039", "-4.75427", "-5.4321", "A1.9") ), row.names = c(NA, 5L), class = "data.frame" )
Données:
# Base-R reshape of `df` (wide: one column per gene, last row holds the IDs)
# into a long table with columns Genes, Timepoints, value, id and ID.
# Assumes `df` has a "Timepoints" column and that its LAST row is the ID row.

# Every column except "Timepoints" is a gene column.
gene_cols <- setdiff(names(df), "Timepoints")

# Create a data frame comprised of the gene names and their IDs, taken from
# the last row of `df`.
ID <- data.frame(
  Genes = gene_cols,
  ID = unlist(df[nrow(df), gene_cols], use.names = FALSE),
  stringsAsFactors = FALSE
)

# Reshape the original data frame (less the ID row) into long format.
# seq_len() is used instead of 1:(n - 1) so a one-row `df` yields zero rows
# rather than the reversed sequence c(1, 0).
df_long <- data.frame(
  reshape(
    df[seq_len(nrow(df) - 1L), ],
    direction = "long",
    varying = gene_cols,
    v.names = "value",
    times = gene_cols,
    timevar = "Genes"
  ),
  row.names = NULL
)

# The ID row forced every gene column to character on import; restore the
# measurements to numeric so they can be plotted on a continuous axis.
df_long$value <- as.numeric(df_long$value)

# Left join the data frame holding the IDs onto the long data frame.
df_long <- merge(df_long, ID, by = "Genes", all.x = TRUE)