# Numeric data: numbers with or without a decimal part.
yield <- 75.5 # Double (decimal)
num_plots <- 120 # Whole number; still stored as a double unless written 120L
class(yield) # Check the type
#> [1] "numeric"
class(num_plots) # Often stored as 'numeric' (double) by default
#> [1] "numeric"
Think of data like building blocks:
Understanding these is fundamental to working with data in R.
R needs to know what kind of information it’s dealing with.
# Numeric data: numbers with or without a decimal part.
yield <- 75.5 # Double (decimal)
num_plots <- 120 # Whole number; still stored as a double unless written 120L
class(yield) # Check the type
#> [1] "numeric"
class(num_plots) # Often stored as 'numeric' (double) by default
#> [1] "numeric"
# Character: text data, written inside double (") or single (') quotes.
# Used for IDs, names, descriptions.
variety_name <- "ICARDA_RustResist"
plot_id <- 'Plot_A101' # Single quotes create the same kind of string
class(variety_name)
#> [1] "character"
# Logical: TRUE/FALSE values, assigned directly or produced by a comparison.
is_resistant <- TRUE
yield > 80 # This comparison results in a logical value
#> [1] FALSE
class(is_resistant)
#> [1] "logical"
# Example: Different locations in a trial
# factor() stores the repeated labels as categories (levels).
locations <- c("Baku", "Ganja", "Baku", "Sheki", "Ganja")
location_factor <- factor(locations)
print(location_factor) # Shows the values followed by the levels
#> [1] Baku  Ganja Baku  Sheki Ganja
#> Levels: Baku Ganja Sheki
class(location_factor)
#> [1] "factor"
levels(location_factor) # See the unique categories
#> [1] "Baku"  "Ganja" "Sheki"
How R organizes collections of data:
# Vector: an ordered collection of values, built with c() (combine function).

# Vector of plot yields (numeric)
plot_yields <- c(75.5, 81.2, 78.9, 85.0)
# Vector of variety names (character)
plot_varieties <- c(
  "ICARDA_Gold", "Local_Check", "ICARDA_Gold", "ICARDA_RustResist"
)
# Vector of resistance status (logical)
plot_resistance <- c(TRUE, FALSE, TRUE, TRUE)
plot_yields[1] # Access the first element (Indexing starts at 1!)
#> [1] 75.5
plot_yields[2:4] # Access elements 2 through 4
#> [1] 81.2 78.9 85.0
length(plot_yields) # Get the number of elements
#> [1] 4
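Important: If you mix types in c(), R will coerce every element to a single common type, the most flexible one present (logical < numeric < character). As soon as one element is text, the whole vector becomes character.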
# Mixing a number, a string and a logical in one c() call.
mixed_vector <- c(10, "VarietyA", TRUE)
print(mixed_vector) # All become character strings!
#> [1] "10"       "VarietyA" "TRUE"
class(mixed_vector)
#> [1] "character"
# Example: Small genotype matrix (Individuals x SNPs)
# byrow = TRUE fills the values row by row: Line1 = 0 1 2, Line2 = 1 1 0.
genotype_data <- matrix(c(0, 1, 2, 1, 1, 0), nrow = 2, ncol = 3, byrow = TRUE)
rownames(genotype_data) <- c("Line1", "Line2")
colnames(genotype_data) <- c("SNP1", "SNP2", "SNP3")
print(genotype_data)
#>       SNP1 SNP2 SNP3
#> Line1    0    1    2
#> Line2    1    1    0
class(genotype_data)
#> [1] "matrix" "array"
dim(genotype_data) # Get dimensions (rows, columns)
#> [1] 2 3
genotype_data[1, 2] # Access element row 1, column 2
#> [1] 1
Data Frame: The most important data structure for breeders! Like a spreadsheet or table in R.
# Create a simple breeding trial data frame
# Each argument becomes one column; all columns have the same length (4 plots).
trial_data <- data.frame(
  PlotID = c("A101", "A102", "B101", "B102"),
  Variety = factor(c(
    "ICARDA_Gold", "Local_Check", "ICARDA_RustResist", "ICARDA_Gold"
  )),
  Yield_kg_plot = c(5.2, 4.5, 6.1, 5.5),
  Is_Resistant = c(TRUE, FALSE, TRUE, TRUE)
)
print(trial_data)
#>   PlotID           Variety Yield_kg_plot Is_Resistant
#> 1   A101       ICARDA_Gold           5.2         TRUE
#> 2   A102       Local_Check           4.5        FALSE
#> 3   B101 ICARDA_RustResist           6.1         TRUE
#> 4   B102       ICARDA_Gold           5.5         TRUE
class(trial_data)
#> [1] "data.frame"
str(trial_data) # Structure: Shows types of each column - VERY USEFUL!
#> 'data.frame':    4 obs. of  4 variables:
#>  $ PlotID       : chr  "A101" "A102" "B101" "B102"
#>  $ Variety      : Factor w/ 3 levels "ICARDA_Gold",..: 1 3 2 1
#>  $ Yield_kg_plot: num  5.2 4.5 6.1 5.5
#>  $ Is_Resistant : logi  TRUE FALSE TRUE TRUE
head(trial_data) # Show first few rows
#>   PlotID           Variety Yield_kg_plot Is_Resistant
#> 1   A101       ICARDA_Gold           5.2         TRUE
#> 2   A102       Local_Check           4.5        FALSE
#> 3   B101 ICARDA_RustResist           6.1         TRUE
#> 4   B102       ICARDA_Gold           5.5         TRUE
summary(trial_data) # Summary statistics for each column
#>     PlotID                       Variety  Yield_kg_plot   Is_Resistant
#>  Length:4           ICARDA_Gold      :2   Min.   :4.500   Mode :logical
#>  Class :character   ICARDA_RustResist:1   1st Qu.:5.025   FALSE:1
#>  Mode  :character   Local_Check      :1   Median :5.350   TRUE :3
#>                                           Mean   :5.325
#>                                           3rd Qu.:5.650
#>                                           Max.   :6.100
# Access columns using $
trial_data$Yield_kg_plot
#> [1] 5.2 4.5 6.1 5.5
mean(trial_data$Yield_kg_plot) # Calculate mean of a column
#> [1] 5.325
(We will work extensively with data frames).
# List: a container whose named elements can each hold a different kind of
# object (here a string, a data frame, a character vector and another list).
analysis_results <- list(
  description = "Yield Trial - Baku 2023",
  raw_data = trial_data, # Include the data frame
  significant_snps = c("SNP101", "SNP504"), # A character vector
  model_parameters = list(threshold = 0.05, method = "MLM") # A nested list
)
print(analysis_results$description)
#> [1] "Yield Trial - Baku 2023"
print(analysis_results$raw_data) # Access the data frame inside the list
#>   PlotID           Variety Yield_kg_plot Is_Resistant
#> 1   A101       ICARDA_Gold           5.2         TRUE
#> 2   A102       Local_Check           4.5        FALSE
#> 3   B101 ICARDA_RustResist           6.1         TRUE
#> 4   B102       ICARDA_Gold           5.5         TRUE