A Guide to Normalizing Data for Different Treatments in R
I have data, as shown below, regarding iron contents in soil and the plant uptake of iron at different growth stages in winter wheat. I want to analyze the relationship between the iron content in the soil and the plant uptake of iron at different growth stages in winter wheat.
# Read the CSV found at the URL held in `github` and store it as a base
# data.frame.
dataA <- data.frame(
  read_csv(url(github), show_col_types = FALSE)
)
Location Season Genotype Reps Iron_ton_ha Stage Fe
1 East 2021 CV1 1 21.7127 Vegetative 0.44
2 East 2021 CV1 2 8.7340 Vegetative 0.30
3 East 2021 CV1 3 9.5003 Vegetative 0.31
4 East 2021 CV1 4 5.9481 Vegetative 0.37
5 East 2021 CV1 5 7.4608 Vegetative 0.30
6 East 2021 CV1 6 20.5326 Vegetative 0.33
7 East 2021 CV1 7 19.8532 Vegetative 0.29
8 East 2021 CV1 8 7.9718 Vegetative 0.35
9 East 2021 CV1 9 15.0087 Vegetative 0.38
10 East 2021 CV1 10 5.6608 Vegetative 0.40
We can simply draw a regression graph. However, before doing that, we need to reshape the data from long to wide format, so that each value in the Stage column (Vegetative, Reproductive, Maturity) becomes its own column of Fe values.
# Reshape dataA from long to wide: every distinct value of Stage becomes a
# column holding the matching Fe value.
dataB <- dataA %>%
  group_by(Location, Season, Genotype, Iron_ton_ha, Stage) %>%
  spread(key = Stage, value = Fe) %>%
  data.frame()
Location Season Genotype Iron_ton_ha Maturity Reproductive Vegetative
1 East 2021 CV1 5.6608 0.14 0.31 0.40
2 East 2021 CV1 5.9481 0.18 0.30 0.37
3 East 2021 CV1 6.2757 0.08 0.29 0.38
4 East 2021 CV1 7.4608 0.12 0.34 0.30
5 East 2021 CV1 7.5574 0.15 0.28 0.36
6 East 2021 CV1 7.8064 0.15 0.29 0.34
7 East 2021 CV1 7.9718 0.17 0.31 0.35
8 East 2021 CV1 8.0175 0.15 0.30 0.45
9 East 2021 CV1 8.2875 0.14 0.31 0.36
10 East 2021 CV1 8.6052 0.11 0.32 0.38
Now, I will analyze the relationship between the iron (Fe) content in the soil and the uptake of iron by plants at maturity.
# Scatter plot of soil Fe (x) against plant Fe uptake at maturity (y), one
# panel per Location; fill and point shape distinguish Season.
# y is mapped to the Maturity column so the plot agrees with its axis label.
FIGA <- ggplot(data = dataB, aes(x = Iron_ton_ha, y = Maturity)) +
  geom_point(aes(fill = as.factor(Season), shape = as.factor(Season)),
             color = "black", size = 5) +
  scale_fill_manual(values = c("grey15", "grey35", "grey55")) +
  scale_shape_manual(values = c(21, 22, 24)) +
  scale_x_continuous(breaks = seq(0, 60, 20), limits = c(0, 60)) +
  scale_y_continuous(breaks = seq(0, 0.5, 0.1), limits = c(0, 0.5)) +
  facet_wrap(~ Location) +
  annotate("segment", x = 20, xend = 40, y = Inf, yend = Inf,
           color = "black", lwd = 1) +
  labs(x = "Fe (Mg/ha) in soil", y = "Plant Fe uptake (%) at maturity") +
  theme_classic(base_size = 18, base_family = "serif") +
  theme(
    legend.key = element_rect(color = "white", fill = "white"),
    legend.text = element_text(family = "serif", face = "plain", size = 13,
                               color = "black"),
    legend.background = element_rect(fill = "white"),
    strip.background = element_rect(color = "white", linewidth = 0.5,
                                    linetype = "solid"),
    axis.line = element_line(linewidth = 0.5, colour = "black")
  )

# Open a separate 9 x 5 inch graphics window and draw the figure.
# dev.new() is used instead of windows() so this also runs outside Windows.
dev.new(width = 9, height = 5, noRStudioGD = TRUE)
print(FIGA)

# Save the figure at the same size (inches converted to cm).
# NOTE(review): "FIGA.png" is a placeholder file name - set the real path.
ggsave("FIGA.png", FIGA,
       width = 9 * 2.54, height = 5 * 2.54, units = "cm", dpi = 1000)

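Now, I’ll normalize the data within each treatment combination (e.g., East, 2021, CV1) using the mutate() function. Each value is converted to a z-score: (value − group mean) / group standard deviation.
For easier normalization calculations, a vertically organized (long) data format is preferable. Therefore, I’ll use dataA rather than the reshaped dataB.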
# Z-score normalization within each Location x Season x Genotype group:
# (value - group mean) / group standard deviation, computed separately for
# plant Fe (Fe) and soil Fe (Iron_ton_ha).
dataC <- dataA %>%
  group_by(Location, Season, Genotype) %>%
  mutate(
    Normalized_Fe_plant = (Fe - mean(Fe)) / sd(Fe),
    Normalized_Fe_soil = (Iron_ton_ha - mean(Iron_ton_ha)) / sd(Iron_ton_ha)
  ) %>%
  ungroup() %>%
  data.frame()
Location Season Genotype Reps Iron_ton_ha Stage Fe Normalized_Fe_plant Normalized_Fe_soil
1 East 2021 CV1 1 21.7127 Vegetative 0.44 1.682394399 1.85849093
2 East 2021 CV1 2 8.7340 Vegetative 0.30 0.246204058 -0.48486395
3 East 2021 CV1 3 9.5003 Vegetative 0.31 0.348789083 -0.34650550
4 East 2021 CV1 4 5.9481 Vegetative 0.37 0.964299229 -0.98786906
5 East 2021 CV1 5 7.4608 Vegetative 0.30 0.246204058 -0.71474518
Then, let’s reshape the normalized data from long to wide format, spreading the values of Stage into separate columns.
# Reshape the normalized data from long to wide: one column of
# Normalized_Fe_plant per Stage.
# Fe and Iron_ton_ha are dropped first; Fe in particular differs between
# stages, so leaving it in would keep every stage on its own row (padded
# with NA) instead of one row per replicate.
# Reps stays in during the reshape so each replicate is a unique row, and is
# removed afterwards.
dataD <- dataC %>%
  select(Location, Season, Genotype, Reps, Normalized_Fe_soil, Stage,
         Normalized_Fe_plant) %>%
  spread(key = Stage, value = Normalized_Fe_plant) %>%
  select(-Reps) %>%
  arrange(Location, Season, Genotype, Normalized_Fe_soil) %>%
  data.frame()
Location Season Genotype Normalized_Fe_soil Maturity Reproductive Vegetative
1 East 2021 CV1 -1.03974219 -1.39515633 0.348789083 1.272054302
2 East 2021 CV1 -0.98786906 -0.98481623 0.246204058 0.964299229
3 East 2021 CV1 -0.92871960 -2.01066648 0.143619034 1.066884253
4 East 2021 CV1 -0.71474518 -1.60032638 0.656544156 0.246204058
5 East 2021 CV1 -0.69730367 -1.29257131 0.041034010 0.861714204
Let’s draw the graph again.
# Scatter plot of normalized soil Fe (x) against normalized plant Fe uptake
# at maturity (y), one panel per Location; fill and point shape distinguish
# Season. Dashed lines mark zero (the group mean) on both axes.
FIGB <- ggplot(data = dataD, aes(x = Normalized_Fe_soil, y = Maturity)) +
  geom_point(aes(fill = as.factor(Season), shape = as.factor(Season)),
             color = "black", size = 5) +
  scale_fill_manual(values = c("grey15", "grey35", "grey55")) +
  scale_shape_manual(values = c(21, 22, 24)) +
  scale_x_continuous(breaks = seq(-5, 5, 2.5), limits = c(-5, 5)) +
  scale_y_continuous(breaks = seq(-5, 5, 2.5), limits = c(-5, 5)) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "black") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
  facet_wrap(~ Location) +
  # NOTE(review): x = 20..40 lies outside the -5..5 x limits set above, so
  # this segment is probably dropped - confirm whether it is still wanted.
  annotate("segment", x = 20, xend = 40, y = Inf, yend = Inf,
           color = "black", lwd = 1) +
  # Both axes show unitless z-scores, so the labels carry no units.
  labs(x = "Normalized Fe in soil",
       y = "Normalized plant Fe uptake at maturity") +
  theme_classic(base_size = 18, base_family = "serif") +
  theme(
    legend.key = element_rect(color = "white", fill = "white"),
    legend.text = element_text(family = "serif", face = "plain", size = 13,
                               color = "black"),
    legend.background = element_rect(fill = "white"),
    strip.background = element_rect(color = "white", linewidth = 0.5,
                                    linetype = "solid"),
    axis.line = element_line(linewidth = 0.5, colour = "black")
  )

# Open a separate 9 x 5 inch graphics window and draw the figure.
# dev.new() is used instead of windows() so this also runs outside Windows.
dev.new(width = 9, height = 5, noRStudioGD = TRUE)
print(FIGB)

# Save the figure at the same size (inches converted to cm).
# NOTE(review): "FIGB.png" is a placeholder file name - set the real path.
ggsave("FIGB.png", FIGB,
       width = 9 * 2.54, height = 5 * 2.54, units = "cm", dpi = 1000)

### full code
# data upload
# Read the CSV found at the URL held in `github` into a base data.frame.
dataA <- data.frame(
  read_csv(url(github), show_col_types = FALSE)
)
# data normalization
# Z-score within each Location x Season x Genotype group:
# (value - group mean) / group standard deviation, for plant Fe (Fe) and
# soil Fe (Iron_ton_ha).
dataB <- dataA %>%
  group_by(Location, Season, Genotype) %>%
  mutate(
    Normalized_Fe_plant = (Fe - mean(Fe)) / sd(Fe),
    Normalized_Fe_soil = (Iron_ton_ha - mean(Iron_ton_ha)) / sd(Iron_ton_ha)
  ) %>%
  ungroup() %>%
  data.frame()
# data reshape
# Long to wide: one column of Normalized_Fe_plant per Stage.
# Fe and Iron_ton_ha are dropped first; Fe differs between stages, so
# leaving it in would keep every stage on its own row (padded with NA).
# Reps stays in during the reshape so each replicate is a unique row, and is
# removed afterwards.
dataC <- dataB %>%
  select(Location, Season, Genotype, Reps, Normalized_Fe_soil, Stage,
         Normalized_Fe_plant) %>%
  spread(key = Stage, value = Normalized_Fe_plant) %>%
  select(-Reps) %>%
  arrange(Location, Season, Genotype, Normalized_Fe_soil) %>%
  data.frame()
# graph
# Normalized soil Fe (x) against normalized plant Fe uptake at maturity (y),
# one panel per Location; fill and point shape distinguish Season. Dashed
# lines mark zero (the group mean) on both axes.
FIGB <- ggplot(data = dataC, aes(x = Normalized_Fe_soil, y = Maturity)) +
  geom_point(aes(fill = as.factor(Season), shape = as.factor(Season)),
             color = "black", size = 5) +
  scale_fill_manual(values = c("grey15", "grey35", "grey55")) +
  scale_shape_manual(values = c(21, 22, 24)) +
  scale_x_continuous(breaks = seq(-5, 5, 2.5), limits = c(-5, 5)) +
  scale_y_continuous(breaks = seq(-5, 5, 2.5), limits = c(-5, 5)) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "black") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
  facet_wrap(~ Location) +
  # NOTE(review): x = 20..40 lies outside the -5..5 x limits set above, so
  # this segment is probably dropped - confirm whether it is still wanted.
  annotate("segment", x = 20, xend = 40, y = Inf, yend = Inf,
           color = "black", lwd = 1) +
  # Both axes show unitless z-scores, so the labels carry no units.
  labs(x = "Normalized Fe in soil",
       y = "Normalized plant Fe uptake at maturity") +
  theme_classic(base_size = 18, base_family = "serif") +
  theme(
    legend.key = element_rect(color = "white", fill = "white"),
    legend.text = element_text(family = "serif", face = "plain", size = 13,
                               color = "black"),
    legend.background = element_rect(fill = "white"),
    strip.background = element_rect(color = "white", linewidth = 0.5,
                                    linetype = "solid"),
    axis.line = element_line(linewidth = 0.5, colour = "black")
  )

# Open a separate 9 x 5 inch graphics window and draw the figure.
# dev.new() is used instead of windows() so this also runs outside Windows.
dev.new(width = 9, height = 5, noRStudioGD = TRUE)
print(FIGB)
Is R calculation correct?
I have just used the code and now I’m wondering whether the code I used correctly calculates normalization. Therefore, I will manually calculate normalization and compare the values.

I pooled data from two seasons and manually normalized yield within each location × fertilizer group as (yield − group mean) / group standard deviation. Now, I’ll check whether this manually normalized value matches what R calculates.
# Load the verification data set from the URL held in `github`.
dataA <- data.frame(read_csv(url(github), show_col_types = FALSE))

# Recompute the z-score of yield within each location x fertilizer group:
# (yield - group mean) / group standard deviation. The result is rounded to
# two decimals so it can be compared directly with the manually calculated
# `normalized` column.
dataB <- dataA %>%
  group_by(location, fertilizer) %>%
  mutate(Normalized_R = round((yield - mean(yield)) / sd(yield), 2)) %>%
  ungroup() %>%
  data.frame()
location surphur_amount season fertilizer yield mean Stdev normalized Normalized_R
1 North 0 2020 Control 100.5 122.5 15.1 -1.45 -1.45
2 North 60 2020 Control 120.3 122.5 15.1 -0.14 -0.14
3 North 120 2020 Control 135.9 122.5 15.1 0.89 0.89
4 North 0 2021 Control 110.3 122.5 15.1 -0.81 -0.81
5 North 60 2021 Control 128.3 122.5 15.1 0.39 0.39
6 North 120 2021 Control 139.5 122.5 15.1 1.13 1.13
7 North 0 2020 Fast 149.0 145.5 19.9 0.17 0.17
8 North 60 2020 Fast 166.3 145.5 19.9 1.04 1.04
9 North 120 2020 Fast 169.9 145.5 19.9 1.23 1.23
10 North 0 2021 Fast 119.2 145.5 19.9 -1.32 -1.32
11 North 60 2021 Fast 133.0 145.5 19.9 -0.63 -0.63
12 North 120 2021 Fast 135.9 145.5 19.9 -0.48 -0.48
13 North 0 2020 Slow 171.3 167.3 22.9 0.17 0.17
14 North 60 2020 Slow 191.1 167.3 22.9 1.04 1.04
15 North 120 2020 Slow 195.3 167.3 22.9 1.23 1.23
16 North 0 2021 Slow 137.0 167.3 22.9 -1.32 -1.32
17 North 60 2021 Slow 152.9 167.3 22.9 -0.63 -0.63
18 North 120 2021 Slow 156.3 167.3 22.9 -0.48 -0.48
19 South 0 2020 Control 109.6 121.4 15.5 -0.76 -0.76
20 South 60 2020 Control 122.3 121.4 15.5 0.06 0.06
21 South 120 2020 Control 125.0 121.4 15.5 0.24 0.24
22 South 0 2021 Control 100.0 121.4 15.5 -1.38 -1.38
23 South 60 2021 Control 125.9 121.4 15.5 0.29 0.29
24 South 120 2021 Control 145.3 121.4 15.5 1.54 1.54
25 South 0 2020 Fast 163.9 160.1 21.9 0.17 0.17
26 South 60 2020 Fast 182.9 160.1 21.9 1.04 1.04
27 South 120 2020 Fast 186.9 160.1 21.9 1.23 1.23
28 South 0 2021 Fast 131.1 160.1 21.9 -1.32 -1.32
29 South 60 2021 Fast 146.3 160.1 21.9 -0.63 -0.63
30 South 120 2021 Fast 149.5 160.1 21.9 -0.48 -0.48
31 South 0 2020 Slow 167.8 164.0 22.4 0.17 0.17
32 South 60 2020 Slow 187.3 164.0 22.4 1.04 1.04
33 South 120 2020 Slow 191.4 164.0 22.4 1.23 1.23
34 South 0 2021 Slow 134.3 164.0 22.4 -1.32 -1.32
35 South 60 2021 Slow 149.8 164.0 22.4 -0.63 -0.63
36 South 120 2021 Slow 153.1 164.0 22.4 -0.48 -0.48
My calculation matches the R calculation, so I can trust the code.