Skip to content

Regression Models

This page is a part of Economic Statistics (ECON304) class taught at Faculty of Economics, Chiang Mai University in Thailand.

This site aims to provide an explanation of regression models using R programming.

Students are expected to be able to:

  1. Compute covariance of two variables
  2. Compute correlation coefficient of two variables
  3. Visualise the data of a variable
  4. Conduct t-test for mean testing
  5. Conduct a simple regression model
  6. Develop a better model using multiple regression

The following is the code:

1. Get it started


# Regression model

# 1. Install package "AER" — but only when it is missing, so the
#    script does not re-download the package on every run
#    (the original called install.packages() unconditionally)

if (!requireNamespace("AER", quietly = TRUE)) {
  install.packages("AER")
}

library(AER)

# 2. Load the CPS1985 data set shipped with the AER package

data(CPS1985)

2. Descriptive Analysis of Data


# 3. Have a look at the CPS data

names(CPS1985) # call the variable names

str(CPS1985) # show the structure of the variables

summary(CPS1985) # show the minimum, quartiles, mean, and maximum of each variable

?CPS1985 # open the help page describing the data set




# In the CPS data the potential dependent variable is "wage".
# If wage is our dependent variable, then we are interested in the
# variation of wage, so first calculate the statistics that
# represent variation.

var(CPS1985$wage) # variance of wage
sd(CPS1985$wage)  # standard deviation of wage

# Data visualisation of wage
boxplot(CPS1985$wage) # plain boxplot of wage
boxplot(
  CPS1985$wage,
  main = "A boxplot of the wage in 1985",
  col = "blueviolet"
)
hist(
  CPS1985$wage,
  col = "blueviolet",
  main = "Histogram of wage in 1985"
)
barplot(CPS1985$wage, col = "blueviolet")

# Now we will construct the regression model. The dependent
# variable is "wage"; as a candidate independent variable, try
# "experience" and visualise the relationship first.
plot(
  CPS1985$experience,
  CPS1985$wage,
  main = "Scatter plot between wage and experience",
  ylab = "Wage (USD/hour)",
  xlab = "Experience ( years of potential work experience )",
  col = "Blue"
)
cov(CPS1985$experience, CPS1985$wage) # covariance of experience and wage
cor(CPS1985$experience, CPS1985$wage) # correlation coefficient
boxplot(CPS1985$experience)           # distribution of experience

3. Regression Model Development


# The regression model has the form:
#   y = a + b * x
#   E(wage) = alpha + beta * experience

# Fit the simple regression of wage on experience
model1 <- lm(formula = wage ~ experience, data = CPS1985)

# Coefficients, standard errors, and fit statistics
summary(model1)

# Fitted equation: E(wage) = 8.37997 + 0.03614 * experience
# R-squared = 0.00757

Let's interpret the model:

  1. Alpha (Intercept)
    alpha = 8.380
    alpha is the value of Y when X is 0.
    When a labour has no experience, the expected wage will be 8.380 USD/hour
  2. Beta (Slope)
    beta = 0.0361
    beta shows how much Y will change when X increases by 1 unit.
    When labourers have 1 more year of experience,
    they will get a 0.0361 USD/hour increase in their wage.
  3. R square
    R-square = 0.00757
    R-square is the proportion of variation in Y that can be explained by the predictor(s) in the model. Experience can explain 0.757% of the variation in wage; the remaining 99.243% is explained by other variables.

###################

4. Variable transformation


# Now we will try to fix the problem, starting with normality:
# wage was found to be positively skewed.

# Install "psych" only when it is missing, so the script does not
# re-download the package on every run
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}

library(psych)

skew(CPS1985$wage) # skewness of wage (positive => right-skewed)

# To reduce the skew we shrink the large wage values.
# First attempt: a square-root transformation, stored as a new
# column so it can be reused later.
CPS1985$sqrt_wage <- sqrt(CPS1985$wage)

boxplot(CPS1985$sqrt_wage,
        main = "square root of wage ",
        col = "blue")

boxplot(CPS1985$wage,
        main = "wage",
        col = "red")

skew(CPS1985$sqrt_wage)

# The square root still leaves a positive skew,
# so use the stronger log transformation.
CPS1985$log_wage <- log(CPS1985$wage)

boxplot(CPS1985$wage,
        main = "wage",
        col = "red")

boxplot(CPS1985$sqrt_wage,
        main = "square root of wage ",
        col = "blue")

boxplot(CPS1985$log_wage,
        main = "log of wage ",
        col = "magenta")

skew(CPS1985$log_wage)

# Compare the three distributions side by side
boxplot(CPS1985$wage, CPS1985$sqrt_wage, CPS1985$log_wage)

# Check candidate transformations for experience as well.
# NOTE(review): log() yields -Inf for observations with zero
# experience, which makes skew() return NaN — confirm against the
# data before relying on the log-transformed experience.
skew(CPS1985$experience)

skew(sqrt(CPS1985$experience))

skew(log(CPS1985$experience))

# Now construct the model using the transformed variables:
# log(wage) regressed on sqrt(experience)

model2 <- lm(log(wage) ~ sqrt(experience), data = CPS1985)

summary(model2)

# Scatter plot of the transformed variables. The predictor
# sqrt(experience) belongs on the x-axis and the response
# log(wage) on the y-axis — the original call had the two
# arguments swapped, which put the response on the x-axis.
plot(sqrt(CPS1985$experience), log(CPS1985$wage))

##############################################

5. Multiple Regression Model Development


# Now from simple regression

# We will develop to multiple regression

# The question is that

# will the inclusion of education make the model better

# Explore education before adding it to the model
boxplot(CPS1985$education)
skew(CPS1985$education)

# Try squaring education to see whether that improves the skew
boxplot((CPS1985$education)^2)
skew((CPS1985$education)^2)

# The untransformed education looks fine, so use it as is.
# Create model3 by adding education to model2.
model3 <- lm(
  log(wage) ~ sqrt(experience) + education,
  data = CPS1985
)

summary(model3)

# Estimated equation: log(wage) = 0.394 + 0.105*sqrt(exp) + 0.09*edu
# R-squared is 23.44%

# Statistically test whether model3 is better than model2
# using an ANOVA (nested-model F test)
anova(model2, model3)

####

# Next, add age to the model

boxplot(CPS1985$age)

skew(CPS1985$age)       # = 0.5452207
skew(sqrt(CPS1985$age)) # = 0.2817645
skew(log(CPS1985$age))  # = 0.003625903 — log gives the least skew

boxplot(log(CPS1985$age))

# model4 = model3 plus log(age)
model4 <- lm(
  log(wage) ~ sqrt(experience) + education + log(age),
  data = CPS1985
)

summary(model4)

# R-squared: 0.2423

# Test whether adding log(age) improves on model3
anova(model3, model4)

# To test the regression assumptions we use the diagnostic plots.
# Split the plotting window into a 2x2 grid FIRST so all four
# diagnostic plots appear at the same time — in the original the
# par() call came after plot(model4), so the plots were shown one
# at a time instead.
par(mfrow = c(2, 2)) # split window to 2x2 scale

plot(model4)

plot(log(CPS1985$wage), log(CPS1985$age))

# model5: log(wage) on log(age) plus age itself
model5 <- lm(log(wage) ~ log(age) + age, data = CPS1985)

summary(model5)

# model6: wage on education, experience, and their interaction.
# In R formula syntax, education * experience already expands to
# education + experience + education:experience, so spelling out
# the main effects again (as the original did) is redundant; the
# fitted model and its terms are identical.
model6 <- lm(wage ~ education * experience, data = CPS1985)

summary(model6)

# model7: wage on experience, age, and their interaction
model7 <- lm(wage ~ experience * age, data = CPS1985)

summary(model7)

#########################

6. Output Visualisation


# Finally we will visualise the output

# Store sqrt(experience) as a column so the model can be fitted on
# named columns, which lets abline() draw the fitted line directly
CPS1985$sqrt_experience <- sqrt(CPS1985$experience)

model2 <- lm(log_wage ~ sqrt_experience

, data = CPS1985)

summary(model2)

# Scatter plot of the transformed data: predictor on x, response on y
plot(CPS1985$sqrt_experience, CPS1985$log_wage)

# Overlay the fitted regression line in red
abline(model2, col = "red")

%d bloggers like this: