探索性数据分析(Exploratory Data Analysis,简称EDA)是通过分析数据集以决定选择哪种方法合适统计推断的过程,也称描述性统计分析。
例如,对于一维数据,人们想知道数据是否近似地服从正态分布,是否呈现拖尾或截尾分布?它的分布是对称的,还是呈偏态的?分布是单峰、双峰,还是多峰的?这时就需要我们对数据进行探索性分析。对于数值型数据,经常要分析一个分布的集中趋势和离散程度,用来描述集中趋势的主要有均值、中位数;描述离散程度的主要有方差、标准差。
###########################################################################
#功能说明:计算描述性统计量
#传入参数:需要“计算的字符串”集合
#返回值:描述性统计量字符串集合
# 样本量|最小值|四分之一分位数|中位数|均值|四分之三分位数|最大值|众数|方差|标准差|相对误差|偏度|峰度
# cnt|mindata|firstdata|mediandata|meandata|thirddata|maxdata|modedata|vardata|sddata|relativeerror|skewnessdata|kurtosisdata
###########################################################################
#rs <- pg.spi.exec(sql)
str <- "4356|4736|4121|4607|4386|3333|3095|3636|3947|4615|3739|4538|3521|4605|3671|3828|3095|3947|4000|3552|4054|4615|3552|4453|4356"
mystr <- strsplit(str,split='|',fixed = TRUE)
abdata <-mystr[[1]]
abdata <- as.numeric(abdata)
#去掉重复值
abdata <- unique(abdata)
#去掉异常值
n <- which(abdata %in% boxplot.stats(abdata)$out)
if (length(n) > 0) abdata <- abdata[-n]
#描述性统计量
data1 <- summary(abdata)
data2 <- floor(abdata / 100)*100
#样本量
cnt <- length(abdata)
#最小值
mindata <- data1[['Min.']]
#四分之一分位数
firstdata <- data1[['1st Qu.']]
#中位数
mediandata <- data1[['Median']]
#均值
meandata <- data1[['Mean']]
#四分之三分位数
thirddata <- data1[['3rd Qu.']]
#最大值
maxdata <- data1[['Max.']]
#众数
modedata <- as.numeric(names(which.max(table(data2))))
#方差
vardata <- var(abdata)
#标准差
sddata <- sd(abdata)
#相对误差
relativeerror <- sddata / meandata
#偏度
skewnessdata <- cnt/((cnt-1)*(cnt-2))*sum((abdata-meandata)^3)/sddata^3
#峰度
kurtosisdata <- ((cnt*(cnt+1))/((cnt-1)*(cnt-2)*(cnt-3))*sum((abdata-meandata)^4)/sddata^4-(3*(cnt-1)^2)/((cnt-2)*(cnt-3)))
if (cnt == 1) {
strdata <- paste(cnt,mindata,firstdata,mediandata,meandata,thirddata,maxdata,modedata,'-1','-1','-1','-1','-1',sep = "|")
}else if(cnt > 1 && cnt <= 3) {
strdata <- paste(cnt,mindata,firstdata,mediandata,meandata,thirddata,maxdata,modedata,vardata,sddata,relativeerror,'-1','-1',sep = "|")
}else {
strdata <- paste(cnt,mindata,firstdata,mediandata,meandata,thirddata,maxdata,modedata,vardata,sddata,relativeerror,skewnessdata,kurtosisdata,sep = "|")
}
#return(strdata)
strdata
## [1] "20|3095|3662|4027|4040|4474|4736|4600|234010.765789474|483.746592535259|0.119739255578034|-0.232361220422573|-1.03769839990333"
利用均值和方差描述集中趋势和离散程度往往基于正态分布,而如果数据时长尾或是有异常值是,这时用均值和方差就不能正确地描述集中趋势和离散程度。
解决方案:1.中位数描述 2.截尾均值 3.稳健的四分位间距(IQR)和平均差(mad)来描述离散程度。 mean(salarym, trim=0.2) #两头截取20%
IQR(salarym)
mad(salarym)
由于绘制直方图时需要先对数据进行分组汇总,因此对样本量较小的情形,直方图会损失一部分信息,此时可以使用茎叶图来进行更精确的描述。茎叶图的形状与功能和直方图非常相似,但它是一种文本化的图形。使用函数 stem(salarym)
str <- "4000|4000|4000|4000|4000|3000|3000|3000|3000|5000|3000|5000|3000|6000|3000|3000|3000|3000|4000|3000|5000|6000|3000|6000|5000"
data1 <- unlist(strsplit(str, split="|", fixed=TRUE))
data2 <- as.double(data1)
#茎叶图
stem(data2)
##
## The decimal point is 3 digit(s) to the right of the |
##
## 3 | 000000000000
## 4 | 000000
## 5 | 0000
## 6 | 000
对于分类数据还可以用饼图来描述。值得注意的是,与条形图一样,对原始数据作饼图前要先分组。
drink=c(3,4,1,1,3,4,3,3,1,3,2,1,2,1,2,3,2,3,1,1,1,1,4,3,1)
table(drink)
## drink
## 1 2 3 4
## 10 4 8 3
par(mfrow=c(1,2))
barplot(drink)
barplot(table(drink))
par(mfrow=c(1,2))
barplot(table(drink)/length(drink),col=1:4)
barplot(table(drink),col=c("red","yellow","blue","white"))
par(mfrow=c(1,1))
drink.count=table(drink)
par(mfrow=c(1,3))
pie(drink.count)
names(drink.count)=c("wine","wite","yellow","beer")
pie(drink.count)
pie(drink.count,col=c("purple","green","cyan","white"))
par(mfrow=c(1,1))
R语言里的densitis()函数可以画密度函数线。内置数据框faithful。下图红色线就是该分布函数的密度线,可以看出是个双峰型。
hist(faithful$eruptions, probability=T, breaks=25)
lines(density(faithful$eruptions), col='red')
下面将一些用于探索性分析的图形函数整合成一个拥有探索性数据分析功能的函数EDA,来对数据进行全面探索性分析:
EDA <- function (x) {
par(mfrow=c(2,2)) #同时作4个图(设置作图窗口为2行2列格式)
hist(x); #直方图|频率图:hist(x, probability=T)
dotchart(x) #点图
boxplot(x,horizontal=T); #箱式图|默认为是垂直型,设为水平型,则horizontal=T
qqnorm(x);qqline(x) #正态概率图
par(mfrow=c(1,1)) #恢复成单图
}
#oPath <- "C:/Users/abdata/Desktop/RegressionCS/"
#oName <- "hjg" #hjg
#oType <- "csv"
#oPathFile <- paste(oPath,oName,'.',oType,sep="",collapse="")
#data <- read.csv(oPathFile, header=T)
#data
#data1 <- unlist(strsplit(as.character(data[1,3]), split="|", fixed=TRUE))
#data1
#data2 <- as.double(data1)
#EDA(data2)
str <- "4356|4736|4121|4607|4386|3333|3095|3636|3947|4615|3739|4538|3521|4605|3671|3828|3095|3947|4000|3552|4054|4615|3552|4453|4356"
data1 <- unlist(strsplit(str, split="|", fixed=TRUE))
data2 <- as.double(data1)
EDA(data2)
str <- "4000|4000|4000|4000|4000|3000|3000|3000|3000|5000|3000|5000|3000|6000|3000|3000|3000|3000|4000|3000|5000|6000|3000|6000|5000"
data1 <- unlist(strsplit(str, split="|", fixed=TRUE))
data2 <- as.double(data1)
EDA1 <- function (x) {
par(mfrow=c(2,2)) #同时作4个图(设置作图窗口为2行2列格式)
#1.分类频数表(Table)
table(x)
#2.条图(Barplot)
#没有对Y轴数据进行分组的条形图
barplot(x)
#对Y轴数据进行分组的条形图
barplot(table(x))
#Y轴数据分组频率条形图
barplot(table(x)/length(x))
#Y轴数据分组频率颜色条形图
barplot(table(x), col=c("red","yellow","blue","white"))
par(mfrow=c(1,1)) #恢复成单图
}
EDA1(data2)
离群值(Qutlier)就是某个或少数几个值明显远离于其他大部分数据。理论上讲,离群值可以出现在各种分布中,但常见的主要出现在具有测量误差(Measurement Error)的数据或者总体是厚尾(Heavy-Tailed)分布的数据中。离群值的检验主要有箱线、Grubbs检验和Dixon’s Q检验。
boxplot.stats()
可以返回箱线图的有关统计量。其用法是:
#boxplot.stats(x, coef=1.5, do.conf=TRUE, do.out=TRUE)
salarym=c(2000,2100,2200,2300,2350,2450,2500,2700,2900,2850,15000,3500,3800,2600,3000,3300,3200,4000,3100,4200)
####outlier detection
boxplot(salarym)
boxplot.stats(salarym)
## $stats
## [1] 2000 2400 2875 3400 4200
##
## $n
## [1] 20
##
## $conf
## [1] 2521.701 3228.299
##
## $out
## [1] 15000
Grubbs检验是由Grubbs F.E. 提出来的用来探索来自于正态总体的单变量数据的离群值。Grubbs检验是基于正态总体的假设,也就是说,在做检验前需要先检验数据的正态性。Grubbs检验每次只能检验一个离群值。(!!!查资料!!!)
##Grubb test
#install.package("outliers")
library(outliers)
set.seed(5)
x=rnorm(10)
x
## [1] -0.84085548 1.38435934 -1.25549186 0.07014277 1.71144087
## [6] -0.60290798 -0.47216639 -0.63537131 -0.28577363 0.13810822
grubbs.test(x)
##
## Grubbs test for one outlier
##
## data: x
## G = 1.8799, U = 0.5637, p-value = 0.1881
## alternative hypothesis: highest value 1.71144087270237 is an outlier
grubbs.test(x,type=20)
##
## Grubbs test for two outliers
##
## data: x
## U = 0.18292, p-value = 0.02345
## alternative hypothesis: highest values 1.38435934347858 , 1.71144087270237 are outliers
grubbs.test(x,type=11)
##
## Grubbs test for two opposite outliers
##
## data: x
## G = 3.11540, U = 0.43194, p-value = 0.7015
## alternative hypothesis: -1.25549186262767 and 1.71144087270237 are outliers
Dixon’s Q检验也可以检验数据中是否存在离群值。
######Dixon Q test
set.seed(8)
x=rnorm(10)
x
## [1] -0.08458607 0.84040013 -0.46348277 -0.55083500 0.73604043
## [6] -0.10788140 -0.17028915 -1.08833171 -3.01105168 -0.59317433
dixon.test(x)
##
## Dixon test for outliers
##
## data: x
## Q = 0.51312, p-value = 0.0652
## alternative hypothesis: lowest value -3.01105167576798 is an outlier
dixon.test(x,opposite=TRUE)
##
## Dixon test for outliers
##
## data: x
## Q = 0.054108, p-value = 0.3471
## alternative hypothesis: highest value 0.840400125597476 is an outlier
dixon.test(x,type=10)
##
## Dixon test for outliers
##
## data: x
## Q = 0.49922, p-value = 0.0316
## alternative hypothesis: lowest value -3.01105167576798 is an outlier
我们经常面临着分析双变量数据之间关系的情形,如要分析人的身高和体重之间的关系;国家的财政收入和税收之间的关系;还有在药物临床试验中,新药是否比旧药好;当前的天气是否依赖于昨天的天气等等。
1.二维表
smoke=c("Y","N","N","Y","N","Y","Y","Y","N","Y")
study=c(1,2,2,3,3,1,2,1,3,2)
smoke=c("Y","N","N","Y","N","Y","Y","Y","N","Y")
study=c("<5h","5-10h","5-10h",">10h",">10h","<5h","5-10h","<5h",">10h","5-10h")
table(smoke,study)
## study
## smoke <5h >10h 5-10h
## N 0 2 2
## Y 3 1 2
tab=table(smoke,study)
prop.table(tab,1)
## study
## smoke <5h >10h 5-10h
## N 0.0000000 0.5000000 0.5000000
## Y 0.5000000 0.1666667 0.3333333
prop.table(tab,2)
## study
## smoke <5h >10h 5-10h
## N 0.0000000 0.6666667 0.5000000
## Y 1.0000000 0.3333333 0.5000000
prop.table(tab)
## study
## smoke <5h >10h 5-10h
## N 0.0 0.2 0.2
## Y 0.3 0.1 0.2
prop = function(x) x/sum(x)
apply(tab,2,prop)
## study
## smoke <5h >10h 5-10h
## N 0 0.6666667 0.5
## Y 1 0.3333333 0.5
t(apply(tab,1,prop))
## study
## smoke <5h >10h 5-10h
## N 0.0 0.5000000 0.5000000
## Y 0.5 0.1666667 0.3333333
2.复式条图
smoke=c("Y","N","N","Y","N","Y","Y","Y","N","Y")
study=c(1,2,2,3,3,1,2,1,3,2)
smoke=c("Y","N","N","Y","N","Y","Y","Y","N","Y")
study=c("<5h","5-10h","5-10h",">10h",">10h","<5h","5-10h","<5h",">10h","5-10h")
par(mfrow=c(1,3))
barplot(table(smoke,study))
barplot(table(study,smoke))
barplot(table(study,smoke), beside=T, legend.text=c("<5h","5-10h",">10"))
#legend()
par(mfrow=c(1,1))
1.箱型图
在实际生活中经常会碰到分成几种类型的数值型数据。如药物临床试验,有实验组和对照组两组数据。
x=c(5,5,13,7,11,11,9,8,9)
y=c(11,8,4,5,9,5,10,5,4,10)
#对变量x和y分别作箱线图,可以看出,x变量的均值要大于y变量的均值。且x、y变量都是偏态分布,x变量左偏,y变量右偏
boxplot(x,y)
d=c(5,5,5,13,7,11,11,9,8,9,11,8,4,5,9,5,10,5,4,10)
g=c(1,1,1, 1,1, 1, 1,1,1,1, 2,2,2,2,2,2, 2,2,2, 2)
boxplot(d~g) #值得注意的是d~g, 分类变量要在~后面。
2.散点图
x=c(5,5,13,7,11,11,9,8,9)
y=c(11,8,4,5,9,5,10,5,10)
#data.entry(c(NA)) #该语句是用数据编辑器输入数据
plot(x,y)
abline(lm(y~x))
3.相关系数
x=c(5,5,13,7,11,11,9,8,9)
y=c(11,8,4,5,9,5,10,5,10)
par(mfrow=c(1,2))
plot(x,y,pch=20)
plot(x,y);abline(lm(y~x))
abline(v=4500)
par(mfrow=c(1,1))
#cor()求的事pearson相关系数
cor(x,y)
## [1] -0.4173404
cor(y,x)
## [1] -0.4173404
#cor()函数也可以求spearman等级相关系数(秩相关系数)。
cor(x,y,method="spearman")
## [1] -0.4138085
#spearman相关是一种秩相关,可先对数据求秩,然后再计算它们的pearson相关。
cor(rank(x),rank(y))
## [1] -0.4138085
#这就是spearman等级相关系数。等同于 cor(x,y,method="spearman")
library(MASS)
data(Cars93)
dim(Cars93)
## [1] 93 27
attach(Cars93)
names(Cars93)
## [1] "Manufacturer" "Model" "Type"
## [4] "Min.Price" "Price" "Max.Price"
## [7] "MPG.city" "MPG.highway" "AirBags"
## [10] "DriveTrain" "Cylinders" "EngineSize"
## [13] "Horsepower" "RPM" "Rev.per.mile"
## [16] "Man.trans.avail" "Fuel.tank.capacity" "Passengers"
## [19] "Length" "Wheelbase" "Width"
## [22] "Turn.circle" "Rear.seat.room" "Luggage.room"
## [25] "Weight" "Origin" "Make"
price=cut(Price,c(0,12,20,max(Price)))
table(price)
## price
## (0,12] (12,20] (20,61.9]
## 22 40 31
levels(price)=c("cheap","okay","expensive")
table(price)
## price
## cheap okay expensive
## 22 40 31
mpg=cut(MPG.highway,c(0,20,30,max(MPG.highway)))
table(mpg)
## mpg
## (0,20] (20,30] (30,50]
## 2 62 29
levels(mpg)=c("gas guzzler","oky","miser")
table(mpg)
## mpg
## gas guzzler oky miser
## 2 62 29
table(Type)
## Type
## Compact Large Midsize Small Sporty Van
## 16 11 22 21 14 9
table(price,Type)
## Type
## price Compact Large Midsize Small Sporty Van
## cheap 3 0 0 18 1 0
## okay 9 3 8 3 9 8
## expensive 4 8 14 0 4 1
table(price,mpg)
## mpg
## price gas guzzler oky miser
## cheap 0 5 17
## okay 2 26 12
## expensive 0 31 0
table(price,Type,mpg)
## , , mpg = gas guzzler
##
## Type
## price Compact Large Midsize Small Sporty Van
## cheap 0 0 0 0 0 0
## okay 0 0 0 0 0 2
## expensive 0 0 0 0 0 0
##
## , , mpg = oky
##
## Type
## price Compact Large Midsize Small Sporty Van
## cheap 1 0 0 4 0 0
## okay 5 3 6 0 6 6
## expensive 4 8 14 0 4 1
##
## , , mpg = miser
##
## Type
## price Compact Large Midsize Small Sporty Van
## cheap 2 0 0 14 1 0
## okay 4 0 2 3 3 0
## expensive 0 0 0 0 0 0
detach()
library(MASS)
data(Cars93)
dim(Cars93)
## [1] 93 27
attach(Cars93)
par(mfrow=c(1,2))
barplot(table(Type,price))
barplot(table(Type,price),beside=T)
par(mfrow=c(1,1))
detach()
library(MASS)
data(Cars93)
dim(Cars93)
## [1] 93 27
attach(Cars93)
boxplot(Price~Type)
r1=rnorm(1000)
f1=factor(rep(1:10,100))
boxplot(r1~f1)
detach()
#library(graphics)
library(MASS)
data(Cars93)
dim(Cars93)
## [1] 93 27
attach(Cars93)
#stripchart(Type~Price)
r2=rnorm(100)
f2=factor(rep(1:5,20))
stripchart(r2~f2)
detach()
#iris
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
levels(iris$Species) #种类 Species 的水平
## [1] "setosa" "versicolor" "virginica"
iris.lab = rep(c("1","2","3"),rep(50,3))
# 重叠散点图
par(mfrow=c(1,2))
plot(iris[,1],iris[,3],type="n") #不显示type="n"
text(iris[,1],iris[,3],cex=0.6) #显示样本序号,缩小字体cex=0.6
plot(iris[,1],iris[,3],type="n")
text(iris[,1],iris[,3],iris.lab,cex=0.7) #显示分类号,缩小字体cex=0.7
par(mfrow=c(1,1))
#矩阵式散点图
pairs(iris)
pairs(iris[,1:4])
pairs(iris[1:4],pch=21,bg=iris.lab) #按iris.lab分类
pairs(iris[1:4],pch=21, bg=c("red", "green3", "blue")[unclass(iris$Species)])