data preparation.R 2.45 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108


# Collect the per-sample value tables from every file in the data folder
# into one long table and save it as output.csv.
#
# Each input file is read with read.csv(). Column 1 is scanned for marker
# lines, and columns 1:2 of the rows between the table markers are kept:
#   "^SAMPLE = <id>"              first line of a sample
#   "!Sample_platform_id = <p>"   platform of that sample
#   "... = Sex: <s>"              sex of that sample
#   "... = age: <a>"              age of that sample
#   "!sample_table_begin"         line before the table's header row
#   "!sample_table_end"           line after the table's last data row
# NOTE(review): these markers look like the GEO SOFT format -- confirm.

# folder holding the raw data files, and folder that receives output.csv
data_dir <- "/Users/jinganqu/Dropbox/Research/Biostat/data_cleaning/data"
out_dir <- "/Users/jinganqu/Dropbox/Research/Biostat/data_cleaning"

# A zero-row table with the final column layout.
empty_output <- function() {
  data.frame(
    ID_REF = character(0),
    Value = numeric(0),
    Sample = character(0),
    Platform = character(0),
    Gender = character(0),
    Age = numeric(0),
    stringsAsFactors = FALSE
  )
}

# Row-bind a list of data frames once, ignoring NULL entries.
bind_frames <- function(frames) {
  frames <- Filter(Negate(is.null), frames)
  if (length(frames) == 0L) {
    return(empty_output())
  }
  do.call(rbind, frames)
}

# First row index in `hits` lying within [from, to]; NA when there is none.
first_hit <- function(hits, from, to) {
  inside <- hits[hits >= from & hits <= to]
  if (length(inside) == 0L) NA_integer_ else inside[1L]
}

# Turn one file (as read by read.csv) into a long table with one row per
# table entry, annotated with the sample, platform, gender and age found in
# the same sample block.
extract_samples <- function(dataset) {
  lines <- dataset[, 1]
  # all markers are literal text, hence fixed = TRUE ("^" is not an anchor)
  find_rows <- function(marker) grep(marker, lines, fixed = TRUE)

  sample_rows <- find_rows("^SAMPLE")
  begin_rows <- find_rows("!sample_table_begin")
  end_rows <- find_rows("!sample_table_end")
  sex_rows <- find_rows("= Sex: ")
  age_rows <- find_rows("= age: ")
  plat_rows <- find_rows("Sample_platform_id")
  # you can define whatever other information you want in the data here

  # A sample's block runs up to the line before the next "^SAMPLE" line.
  # Markers are looked up inside that block instead of by position, so a
  # sample with a missing or repeated line cannot shift the values of the
  # samples that follow it.
  block_last <- c(sample_rows[-1L] - 1L, length(lines))

  pieces <- lapply(seq_along(sample_rows), function(j) {
    first <- sample_rows[j]
    last <- block_last[j]

    begin <- first_hit(begin_rows, first, last)
    end <- if (is.na(begin)) NA_integer_ else first_hit(end_rows, begin, last)
    if (is.na(end)) {
      warning("no complete sample table for: ", lines[first], call. = FALSE)
      return(NULL)
    }

    # data rows start two lines after the begin marker (marker + header row)
    # and stop one line before the end marker; seq_len() keeps an empty
    # table empty instead of counting backwards
    n <- max(end - begin - 2L, 0L)
    rows <- begin + 1L + seq_len(n)

    sam <- sub("^SAMPLE = ", "", lines[first], fixed = TRUE)
    platform <- sub("!Sample_platform_id = ", "",
                    lines[first_hit(plat_rows, first, last)], fixed = TRUE)
    # a missing marker indexes with NA and therefore yields NA below
    gender <- strsplit(lines[first_hit(sex_rows, first, last)], ": ")[[1]][2]
    age <- as.numeric(
      strsplit(lines[first_hit(age_rows, first, last)], ": ")[[1]][2]
    )

    data.frame(
      ID_REF = dataset[rows, 1],
      Value = as.numeric(dataset[rows, 2]),
      Sample = rep(sam, n),
      Platform = rep(platform, n),
      Gender = rep(gender, n),
      Age = rep(age, n),
      stringsAsFactors = FALSE
    )
  })

  bind_frames(pieces)
}

# fail loudly when the data folder is missing
stopifnot(dir.exists(data_dir))

# every regular file in the data folder is treated as an input file
datafiles <- list.files(data_dir, full.names = TRUE)
datafiles <- datafiles[!dir.exists(datafiles)]

# parse each file, then bind everything once instead of growing `output`
# row by row inside the loops
output <- bind_frames(lapply(datafiles, function(path) {
  dataset <- read.csv(path, header = FALSE, stringsAsFactors = FALSE)
  extract_samples(dataset)
}))

# save the output into local
write.csv(output, file.path(out_dir, "output.csv"), row.names = FALSE)