관리 메뉴

moozi

script1.R 본문

TIS_2019/R데이터분석_2기

script1.R

moozi 2019. 11. 2. 09:49

# Scalar assignment and basic arithmetic; bare expressions auto-print at top level.
a <- 1
b <- 2
a + b

# A variable named `c` does not stop later calls to the c() function from working.
c <- 3
a + b + c
4 / b
2 * c

# Numeric vectors built from literal values, the `:` operator, and seq().
var1 <- c(10, 20, 30, 40, 50)
var1

var2 <- 1:5
var2

# Odd numbers from 1 up to (at most) 10.
var3 <- seq(from = 1, to = 10, by = 2)
var3

# Every third number starting at 1.
var4 <- seq(from = 1, to = 10, by = 3)
var4


# Arithmetic on vectors is applied element by element.
v1 <- c(1, 2, 3, 4, 5)
v2 <- c(10, 20, 30, 40, 50)
v1 + v2


# A character vector.
str5 <- c("Hello", "World", "is", "good")
str5

# Basic summary statistics of a numeric vector.
score <- c(100, 80, 70, 50)
mean(score)
min(score)
max(score)

# Collapse the elements of a character vector into one comma-separated string.
data <- c("홍길동", "서울시", "010-1111-2222")
paste(data, collapse = ",")

# Install ggplot2 only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

library(ggplot2)

# Frequency bar chart of a character vector.
x <- c("a", "a", "b", "c", "e", "e", "e", "f")
qplot(x)

# Quick plots of the mpg dataset that ships with ggplot2.
qplot(data = mpg, x = cty)
qplot(data = mpg, x = drv, y = hwy, geom = "boxplot", colour = drv)



# Build a data frame from two existing vectors and average each column.
english <- c(100, 90, 80, 50)
math <- c(70, 90, 80, 100)
df_midterm <- data.frame(english, math)
df_midterm
mean(df_midterm$english)
mean(df_midterm$math)

# Rebuild df_midterm in a single call, with the columns defined inline.
df_midterm <- data.frame(
  english = c(90, 80, 60, 70),
  math    = c(50, 60, 100, 20),
  class   = c(1, 1, 2, 2)
)
df_midterm

# Install readxl only when it is missing, so re-running the script does not
# reinstall it every time.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}
library(readxl)

# Read the workbook from the working directory.
df_frame <- read_excel("excel_exam.xlsx")
df_frame

# Add per-row total and average over the three subject columns.
df_frame$total <- df_frame$math + df_frame$english + df_frame$science
df_frame$avg <- df_frame$total / 3
df_frame






# Save df_midterm to an .rda file and load it back.
# The original saved df_frame under the name "df_midterm.rda", so the load()
# below restored df_frame while the code went on to inspect df_midterm.
save(df_midterm, file = "df_midterm.rda")

# load() restores objects under the names they were saved with.
load("df_midterm.rda")
df_midterm
mean(df_midterm$english)


mean(df_frame$english)
mean(df_frame$math)

# Workbook read with col_names = FALSE, i.e. the first row is not used as
# column names; the result is then exported as CSV.
df_exam_novar <- read_excel("excel_exam_novar.xlsx", col_names = FALSE)
df_exam_novar
write.csv(df_exam_novar, file = "df_exam_novar.csv")





# Read the third sheet of a multi-sheet workbook.
df_exam_sheet <- read_excel("excel_exam_sheet.xlsx", sheet = 3)
df_exam_sheet

# Read a CSV file with base R.
df_csv_exam <- read.csv("csv_exam.csv")
df_csv_exam

# Recreate df_midterm and write it to a CSV file in the working directory.
df_midterm <- data.frame(
  english = c(90, 80, 60, 70),
  math = c(50, 60, 100, 20),
  class = c(1, 1, 2, 2)
)
df_midterm
write.csv(df_midterm, file = "df_midterm.csv")

# Load the exam data and look at it from several angles.
exam <- read.csv("csv_exam.csv")
exam

# First and last rows (default row count, then 10).
head(exam)
head(exam, n = 10)
tail(exam)
tail(exam, n = 10)

# Open the data viewer.
View(exam)

# Dimensions, structure, and per-column summary statistics.
dim(exam)

str(exam)

summary(exam)


# Open the help page for the mpg dataset.
help("mpg")

# Take a plain data.frame copy of ggplot2's mpg so it can be modified later.
mpg <- as.data.frame(ggplot2::mpg)
dim(mpg)





mpg

head(mpg)
tail(mpg)

View(mpg)
summary(mpg)


# Small data frame used to demonstrate renaming a column.
df_raw <- data.frame(
  var1 = c(1, 2, 1),
  var2 = c(2, 3, 2)
)
df_raw

# Install dplyr only when it is missing, so re-running the script does not
# reinstall it every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

# rename() takes new_name = old_name; df_raw itself is left unchanged.
df_new <- rename(df_raw, v2 = var2)
df_new

df_copy <- df_new
df_copy


# Add a derived column holding the row-wise sum of var1 and var2.
df <- data.frame(
  var1 = c(10, 20, 30),
  var2 = c(100, 200, 300)
)
df[["sum"]] <- df[["var1"]] + df[["var2"]]
df

# Average of city and highway mileage per car.
mpg$total <- (mpg$cty + mpg$hwy) / 2
head(mpg)

# Pass/fail flag: "pass" when total is at least 20.
mpg$test <- ifelse(mpg$total >= 20, "pass", "fail")
tail(mpg)

# Three-level grade: A (>= 30), B (>= 20), otherwise C.
mpg$grade <- ifelse(
  mpg$total >= 30, "A",
  ifelse(mpg$total >= 20, "B", "C")
)
head(mpg, n = 20)

# Distribution of total, first with base graphics and then with qplot.
hist(mpg$total)

library(ggplot2)
qplot(data = mpg, x = total, colour = total)

# Number of cars per grade.
table(mpg$grade)

# Four-level grade: A (>= 30), B (>= 25), C (>= 20), otherwise D.
# The lowest label was previously the typo "D>".
mpg$grade2 <- ifelse(
  mpg$total >= 30, "A",
  ifelse(
    mpg$total >= 25, "B",
    ifelse(mpg$total >= 20, "C", "D")
  )
)
head(mpg)


library(dplyr)
exam <- read.csv("csv_exam.csv")
exam

# Rows that are not in class 2.
exam %>%
  filter(class != 2)

# Rows with a math score above 50.
exam %>%
  filter(math > 50)

# Both conditions must hold.
exam %>%
  filter(class == 2 & math == 100)

# Either condition may hold.
exam %>%
  filter(math >= 90 | english >= 90)

# Two equivalent ways to keep classes 1, 3 and 5.
exam %>%
  filter(class == 1 | class == 3 | class == 5)
exam %>%
  filter(class %in% c(1, 3, 5))

# Rows in neither class 2 nor class 4.
exam %>%
  filter(class != 2 & class != 4)

# Keep per-class subsets for later inspection.
class1 <- exam %>%
  filter(class == 1)
class2 <- exam %>%
  filter(class == 2)

class1
class2

# Remainder of integer division.
7 %% 2

# Label 7 as odd ("홀수") or even ("짝수") based on the remainder.
result <- if (7 %% 2 == 1) "홀수" else "짝수"
result

# Keep only the listed columns.
exam %>%
  select(class, math, english)

# First 10 rows of the english column.
exam %>%
  select(english) %>%
  head(n = 10)



# dplyr is already attached earlier in this script; install it only when it is
# genuinely missing instead of reinstalling it while it is loaded.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
exam <- read.csv("csv_exam.csv")
exam

# Sort ascending by math.
exam %>%
  arrange(math)

# Sort by class, then by math descending within each class.
exam %>%
  arrange(class, desc(math))

# Add a total column and show the first rows.
exam %>%
  mutate(total = math + english + science) %>%
  head()


# exam2 keeps only the rows returned by head(), with total and mean added.
exam2 <- exam %>%
  mutate(
    total = math + english + science,
    mean = (math + english + science) / 3
  ) %>%
  head()
exam2

# Per-class math statistics and row counts.
exam %>%
  group_by(class) %>%
  summarise(
    mean_math = mean(math),
    sum_math = sum(math),
    median_math = median(math),
    count = n()
  )


# Mean city mileage per manufacturer and drive type (first 10 rows).
mpg %>%
  group_by(manufacturer, drv) %>%
  summarise(mean_cty = mean(cty)) %>%
  head(n = 10)


# Mean combined mileage of SUVs per manufacturer, highest first.
mpg %>%
  group_by(manufacturer) %>%
  filter(class == "suv") %>%
  mutate(tot = (cty + hwy) / 2) %>%
  summarise(mean_tot = mean(tot)) %>%
  arrange(desc(mean_tot))

# Midterm and final scores keyed by the same student id.
test1 <- data.frame(
  id = c(1, 2, 3, 4, 5),
  midterm = c(60, 80, 70, 90, 85)
)
test1

test2 <- data.frame(
  id = c(1, 2, 3, 4, 5),
  final = c(70, 83, 65, 95, 80)
)
test2

# Combine the two tables column-wise, matching rows on id.
total <- left_join(test1, test2, by = "id")
total

# Lookup table mapping each class to its teacher.
name <- data.frame(
  class = c(1, 2, 3, 4, 5),
  teacher = c("kim", "lee", "park", "choi", "jung")
)

name

exam <- read.csv("csv_exam.csv")
exam

# Attach the teacher name to every exam row via the class key.
exam_new <- left_join(exam, name, by = "class")
exam_new


# Groups a, b and c share the same column names (id, test).
group_a <- data.frame(
  id = c(1, 2, 3, 4, 5),
  test = c(60, 80, 70, 90, 85)
)
group_a

group_b <- data.frame(
  id = c(6, 7, 8, 9, 10),
  test = c(70, 83, 65, 95, 80)
)
group_b

# Stack the rows of a and b.
group_all <- bind_rows(group_a, group_b)
group_all

group_c <- data.frame(
  id = c(11, 12, 13, 14, 15),
  test = c(70, 83, 65, 95, 80)
)
group_c


# Group d deliberately uses different column names (id2, test2).
group_d <- data.frame(
  id2 = c(16, 17, 18, 19, 20),
  test2 = c(70, 83, 65, 95, 80)
)
group_d

group_all2 <- bind_rows(group_a, group_b, group_c)
group_all2

# Stacking with group_d shows how bind_rows handles non-matching columns.
group_all3 <- bind_rows(group_a, group_b, group_c, group_d)
group_all3


# Data frame with one missing value in each column.
df <- data.frame(
  sex = c("M", "F", NA, "M", "F"),
  score = c(5, 4, 3, 4, NA)
)
df

# Keep only rows where both score and sex are present.
df_nomiss <- df %>%
  filter(!is.na(score), !is.na(sex))
df_nomiss

# Logical matrix marking every missing cell.
m <- is.na(df)
m


# Reload exam and blank out the math score in rows 3, 8 and 15.
exam <- read.csv("csv_exam.csv")
exam[c(3, 8, 15), "math"] <- NA
exam

# Summary statistics that skip the missing values.
exam %>%
  summarise(
    mean_math = mean(math, na.rm = TRUE),
    sum_math = sum(math, na.rm = TRUE),
    median_math = median(math, na.rm = TRUE)
  )

mean(exam$math, na.rm = TRUE)


# Start again from the file, then replace missing math scores with the mean
# of the observed ones.
exam <- read.csv("csv_exam.csv")
exam[c(3, 8, 15), "math"] <- NA
exam

exam$math <- ifelse(
  is.na(exam$math),
  mean(exam$math, na.rm = TRUE),
  exam$math
)
exam


# Toy data containing one sex code of 3 and one score of 6.
outlier <- data.frame(
  sex = c(1, 2, 1, 3, 2, 1),
  score = c(5, 4, 3, 4, 2, 6)
)
outlier

# Recode sex == 3 as missing.
outlier$sex <- ifelse(outlier$sex == 3, NA, outlier$sex)
outlier

# Recode scores above 5 as missing.
outlier$score <- ifelse(outlier$score > 5, NA, outlier$score)
outlier

# Mean score per sex over the rows with no missing values.
out <- outlier %>%
  filter(!is.na(sex), !is.na(score)) %>%
  group_by(sex) %>%
  summarise(mean_score = mean(score))
out

# Draw the box plot and print its summary statistics.
boxplot(mpg$hwy)$stats

# Treat highway mileage below 12 or above 37 as missing.
# NOTE(review): the cut-offs are presumably the whisker limits printed by the
# stats above - confirm against that output.
mpg$hwy <- ifelse(mpg$hwy < 12 | mpg$hwy > 37, NA, mpg$hwy)

# Rows whose hwy was blanked out. The column is referenced by name inside
# filter() rather than through mpg$hwy; a stray empty filter() call that only
# printed the whole table was removed.
mpg %>%
  filter(is.na(hwy))

table(mpg$hwy)

# Mean highway mileage per drive type, ignoring the blanked-out values.
mpg %>%
  group_by(drv) %>%
  summarise(mean_hwy = mean(hwy, na.rm = TRUE))


# ggplot2 is already attached earlier in this script; install it only when it
# is genuinely missing instead of reinstalling it while it is loaded.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
library(dplyr)

# Rows with a missing hwy; the column is referenced by name inside filter().
mpg %>%
  filter(is.na(hwy))
??mpg

# Scatter plot of displacement against highway mileage, x axis limited to 3-6.
ggplot(data = mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  xlim(3, 6)

# Mean highway mileage per drive type over rows with a non-missing hwy.
df_mpg <- mpg %>%
  filter(!is.na(hwy)) %>%
  group_by(drv) %>%
  summarise(mean_hwy = mean(hwy))
df_mpg

# Bar chart of the summarised means, then a frequency bar chart of raw hwy.
ggplot(data = df_mpg, aes(x = drv, y = mean_hwy)) +
  geom_col()
ggplot(data = mpg, aes(x = hwy)) +
  geom_bar()



# Replace the in-memory mpg with the copy stored in mpg.csv.
mpg <- read.csv("mpg.csv")


# Time series line chart of unemploy over date from the economics dataset.
ggplot(data = economics, aes(x = date, y = unemploy)) +
  geom_line()

# Box plot of highway mileage per drive type.
ggplot(data = mpg, aes(x = drv, y = hwy)) +
  geom_boxplot()

help("economics")



# Install foreign only when it is missing, so re-running the script does not
# reinstall it every time.
if (!requireNamespace("foreign", quietly = TRUE)) {
  install.packages("foreign")
}

library(foreign) # SPSS file handling
library(dplyr)   # preprocessing verbs and %>%
library(ggplot2) # plotting
library(readxl)  # reading Excel files

# Read the SPSS file as a data frame; work on a copy so the raw import stays
# untouched. TRUE is spelled out because `T` is an ordinary, reassignable name.
raw_welfare <- read.spss(
  file = "Koweps_hpc10_2015_beta1.sav",
  to.data.frame = TRUE
)
welfare <- raw_welfare

summary(welfare)
head(welfare)


# Give the coded survey columns readable names (new_name = old_name).
welfare <- welfare %>%
  rename(
    sex = h10_g3,
    birth = h10_g4,
    marriage = h10_g10,
    religion = h10_g11,
    income = p1002_8aq1,
    code_job = h10_eco9,
    code_region = h10_reg7
  )
head(welfare)

# Frequency of each sex code, computed two ways.
table(welfare$sex)

welfare %>%
  group_by(sex) %>%
  summarise(count = n())

# Code 9 is recoded to missing; code 1 is then labelled "male" and every other
# non-missing code "female".
welfare$sex <- ifelse(welfare$sex == 9, NA, welfare$sex)

welfare$sex <- ifelse(welfare$sex == 1, "male", "female")

qplot(welfare$sex)
# The column is `income`; the original `welfare$incom` only worked because `$`
# partially matches column names on a data.frame.
qplot(welfare$income)

# Income values of 0 and 9999 are recoded to missing.
welfare$income <- ifelse(welfare$income %in% c(0, 9999), NA, welfare$income)

# Count of missing incomes, computed two ways.
table(is.na(welfare$income))
welfare %>%
  filter(is.na(income)) %>%
  summarise(count = n())

# Mean income per sex, first by dropping missing incomes beforehand...
sex_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(sex) %>%
  summarise(mean_income = mean(income))
# ...then by skipping them inside mean(); this second result is the one kept.
sex_income <- welfare %>%
  group_by(sex) %>%
  summarise(mean_income = mean(income, na.rm = TRUE))
sex_income

ggplot(data = sex_income, aes(x = sex, y = mean_income)) +
  geom_col()

# Age derived from birth year, counting the birth year itself as age 1.
# NOTE(review): the reference year is 2019 while the data file name contains
# 2015 - confirm which year the age should be measured against.
welfare$age <- 2019 - welfare$birth + 1

qplot(welfare$age)

# Mean income at each age over rows with a non-missing income.
age_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(age) %>%
  summarise(mean_income = mean(income))

head(age_income)

ggplot(data = age_income, aes(x = age, y = mean_income)) +
  geom_line()

# Three coarse age groups: under 30, 30 to 59, and 60 or older.
welfare <- welfare %>%
  mutate(
    ageg = ifelse(age < 30, "young",
                  ifelse(age <= 59, "middle", "old"))
  )
qplot(welfare$ageg)


# Overwrite ageg with ten-year bands: "child" (< 10), "10" through "80", and
# "older" (90 and above).
welfare <- welfare %>%
  mutate(
    ageg = ifelse(age < 10, "child",
           ifelse(age < 20, "10",
           ifelse(age < 30, "20",
           ifelse(age < 40, "30",
           ifelse(age < 50, "40",
           ifelse(age < 60, "50",
           ifelse(age < 70, "60",
           ifelse(age < 80, "70",
           ifelse(age < 90, "80", "older")))))))))
  )

# Fix the x-axis order so the bands appear from youngest to oldest.
qplot(welfare$ageg) +
  scale_x_discrete(
    limits = c("child", "10", "20", "30", "40", "50", "60", "70", "80", "older")
  )

# Mean income per age band and sex over rows with a non-missing income.
sex_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(ageg, sex) %>%
  summarise(mean_income = mean(income))

sex_income

# ageg holds the ten-year band labels at this point in the script, so the axis
# limits must list those labels. The previous limits ("young", "middle",
# "old") matched none of them, so every bar was dropped from the chart.
ggplot(data = sex_income, aes(x = ageg, y = mean_income, fill = sex)) +
  geom_col(position = "dodge") +
  scale_x_discrete(
    limits = c("child", "10", "20", "30", "40", "50", "60", "70", "80", "older")
  )

# Mean income at each exact age, split by sex.
sex_age <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(age, sex) %>%
  summarise(mean_income = mean(income))

head(sex_age)

# One line per sex.
ggplot(data = sex_age, aes(x = age, y = mean_income, col = sex)) +
  geom_line()


# Mean income per age group and sex over rows with a non-missing income.
sex_ageg <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(ageg, sex) %>%
  summarise(mean_income = mean(income))
sex_ageg

# Bars are distinguished by fill. `col` only colours the bar outline, which
# left the dodged bars all the same grey; the earlier bar chart of the same
# data in this script uses fill = sex as well.
ggplot(data = sex_ageg, aes(x = ageg, y = mean_income, fill = sex)) +
  geom_col(position = "dodge")


library(readxl)

# Second sheet of the codebook, with its first row used as column names.
list_job <- read_excel("Koweps_Codebook.xlsx", col_names = TRUE, sheet = 2)
head(list_job)

# Attach the codebook columns to welfare on code_job. The join key argument is
# `by`; the original `id = "code_job"` is not a left_join() parameter, so the
# key was never actually specified.
welfare <- left_join(welfare, list_job, by = "code_job")

head(welfare)

# Spot-check the joined job column against its code.
welfare %>%
  filter(!is.na(code_job)) %>%
  select(code_job, job) %>%
  head(n = 10)

# Mean income per job over rows where both job and income are present.
job_income <- welfare %>%
  filter(!is.na(job), !is.na(income)) %>%
  group_by(job) %>%
  summarise(mean_income = mean(income))

head(job_income)


# Ten jobs with the highest mean income.
top10 <- job_income %>%
  arrange(desc(mean_income)) %>%
  head(n = 10)

top10

# Horizontal bars ordered by mean income.
ggplot(data = top10, aes(x = reorder(job, mean_income), y = mean_income)) +
  geom_col() +
  coord_flip()


# Ten jobs with the lowest mean income.
bottom10 <- job_income %>%
  arrange(mean_income) %>%
  head(n = 10)

# Ordered the opposite way; the value axis is limited to 0-150.
ggplot(data = bottom10, aes(x = reorder(job, -mean_income), y = mean_income)) +
  geom_col() +
  coord_flip() +
  ylim(0, 150)


# Ten most common jobs among rows labelled "male".
job_male <- welfare %>%
  filter(!is.na(job) & sex == "male") %>%
  group_by(job) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  head(n = 10)

job_male

# Ten most common jobs among rows labelled "female".
job_female <- welfare %>%
  filter(!is.na(job) & sex == "female") %>%
  group_by(job) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  head(n = 10)

job_female


# Ten largest (job, sex) combinations by head count.
job_gender <- welfare %>%
  filter(!is.na(job)) %>%
  group_by(job, sex) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  head(n = 10)

job_gender

# The data has one row per job and sex, so the bars are filled by sex.
# Without the fill, the rows of a job that appears for both sexes were stacked
# into a single bar with no way to tell them apart.
ggplot(data = job_gender, aes(x = reorder(job, n), y = n, fill = sex)) +
  geom_col() +
  coord_flip()


# graphics is a base package that ships with R, so it cannot be installed with
# install.packages(); the original call was removed.
library(graphics)

x <- c(9, 15, 20, 6)
label <- c("영업1팀", "영업2팀", "영업3팀", "영업4팀")

# Flat pie chart with base graphics.
pie(x, labels = label, main = "부서별 영업 실적")

# The package name was misspelled as "prlotrix", so plotrix was never
# installed. Install it only when it is missing.
if (!requireNamespace("plotrix", quietly = TRUE)) {
  install.packages("plotrix")
}
library(plotrix)


# 3D pie chart with the slices pulled apart.
pie3D(x, labels = label, explode = 0.1, labelcex = 0.7, main = "부서별 영업 실적")


library(dplyr)

# Row count per marriage code, largest first (at most 10 codes).
m_cnt <- welfare %>%
  filter(!is.na(marriage)) %>%
  group_by(marriage) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  head(n = 10)

m_cnt

# 3D pie chart of the counts, labelled with the marriage code.
pie3D(
  m_cnt$n,
  labels = m_cnt$marriage,
  explode = 0.1,
  labelcex = 0.7,
  main = "결혼형태집계"
)


# Religion code 1 is labelled "yes", every other non-missing code "no".
welfare$religion <- ifelse(welfare$religion == 1, "yes", "no")

# Marriage code 1 is labelled "marriage" and code 3 "divorce"; every other
# code becomes missing.
welfare$group_marriage <- ifelse(
  welfare$marriage == 1, "marriage",
  ifelse(welfare$marriage == 3, "divorce", NA)
)

# Counts per religion and marriage group. summarise() leaves the result
# grouped by religion, so tot_group and pct are computed within each
# religion value.
religion_marriage <- welfare %>%
  filter(!is.na(group_marriage)) %>%
  group_by(religion, group_marriage) %>%
  summarise(n = n()) %>%
  mutate(tot_group = sum(n)) %>%
  mutate(pct = round(n / tot_group * 100, 1))

religion_marriage


# Slice labels are "<religion> <group_marriage>". The original passed " " as a
# third positional argument to paste(), which appended it as an extra element
# and left trailing spaces on every label; the default separator already
# inserts the single space.
pie3D(
  religion_marriage$n,
  labels = paste(religion_marriage$religion, religion_marriage$group_marriage),
  explode = 0.1,
  labelcex = 0.7,
  main = "종교유무별 이혼률"
)






'TIS_2019 > R데이터분석_2기' 카테고리의 다른 글

R 카카오맵  (0) 2019.11.09
인구이동 통계 워드클라우드  (0) 2019.11.09
oracle table to excel  (0) 2019.11.02
분석데이터  (0) 2019.11.02
mpg.csv  (0) 2019.10.26
Comments