library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(readxl)library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(readxl)data <- read_excel("file1_dummy_patient_data_R_workshop.xlsx")dim(data)[1] 287230 9
nrow(data)[1] 287230
ncol(data)[1] 9
#number of duplicated row
sum(duplicated(data))[1] 2831
# Square bracket indexing to select rows and columns
# df[row_index,column_index]
data[1:5,]# A tibble: 5 × 9
id_patient birthdate sex test_date location
<chr> <dttm> <chr> <dttm> <chr>
1 205XDFDX95 1964-01-12 00:00:00 Male 2024-02-09 12:01:41 Gujranwala
2 205XDFDX95 1964-01-13 00:00:00 Male 2024-02-09 12:01:41 Gujranwala
3 205XDFDX95 1964-01-14 00:00:00 Male 2024-02-09 12:01:41 Gujranwala
4 205XDFDX95 1964-01-15 00:00:00 Male 2024-02-09 12:01:41 Gujranwala
5 199XDFDX53 1964-01-16 00:00:00 Male 2024-11-06 03:16:28 Mardan
# ℹ 4 more variables: test_procedure_name <chr>, test_procedure_result <chr>,
# result_detail <chr>, result_category <chr>
#to see duplicate data
data[duplicated(data), ]# A tibble: 2,831 × 9
id_patient birthdate sex test_date location
<chr> <dttm> <chr> <dttm> <chr>
1 176XDFDX98 1935-02-26 00:00:00 Female 2024-02-04 00:17:34 Lahore
2 240XDFDX13 1935-02-26 00:00:00 Female 2024-08-22 17:04:02 Muzaffarabad
3 185XDFDX92 1935-03-12 00:00:00 Male 2024-10-31 14:56:58 Faisalabad
4 241XDFDX75 1935-03-15 00:00:00 Female 2025-01-04 10:09:33 Bannu
5 240XDFDX40 1935-04-04 00:00:00 Female 2024-10-27 03:32:13 Peshawar
6 118XDFDX38 1935-04-23 00:00:00 Female 2024-05-20 20:35:23 Islamabad
7 176XDFDX84 1935-05-01 00:00:00 Female 2024-01-11 20:47:42 Abbottabad
8 245XDFDX57 1935-05-28 00:00:00 Female 2024-12-07 14:38:04 Peshawar
9 162XDFDX82 1935-06-02 00:00:00 Female 2024-09-12 22:08:01 Mirpur
10 248XDFDX36 1935-06-28 00:00:00 Male 2024-05-24 17:27:38 Hyderabad
# ℹ 2,821 more rows
# ℹ 4 more variables: test_procedure_name <chr>, test_procedure_result <chr>,
# result_detail <chr>, result_category <chr>
# duplicate data including the first occurrence
data[duplicated(data) | duplicated(data, fromLast = TRUE), ]# A tibble: 5,629 × 9
id_patient birthdate sex test_date location
<chr> <dttm> <chr> <dttm> <chr>
1 176XDFDX98 1935-02-26 00:00:00 Female 2024-02-04 00:17:34 Lahore
2 176XDFDX98 1935-02-26 00:00:00 Female 2024-02-04 00:17:34 Lahore
3 240XDFDX13 1935-02-26 00:00:00 Female 2024-08-22 17:04:02 Muzaffarabad
4 240XDFDX13 1935-02-26 00:00:00 Female 2024-08-22 17:04:02 Muzaffarabad
5 185XDFDX92 1935-03-12 00:00:00 Male 2024-10-31 14:56:58 Faisalabad
6 185XDFDX92 1935-03-12 00:00:00 Male 2024-10-31 14:56:58 Faisalabad
7 241XDFDX75 1935-03-15 00:00:00 Female 2025-01-04 10:09:33 Bannu
8 241XDFDX75 1935-03-15 00:00:00 Female 2025-01-04 10:09:33 Bannu
9 240XDFDX40 1935-04-04 00:00:00 Female 2024-10-27 03:32:13 Peshawar
10 240XDFDX40 1935-04-04 00:00:00 Female 2024-10-27 03:32:13 Peshawar
# ℹ 5,619 more rows
# ℹ 4 more variables: test_procedure_name <chr>, test_procedure_result <chr>,
# result_detail <chr>, result_category <chr>
# remove duplicated data
data_unique <- unique(data)dim(data_unique)[1] 284399 9
rename variable
names(data_unique)[names(data_unique)=="date of test"] <- "test_date"
names(data_unique)[names(data_unique)=="result detail"] <- "result_detail"sum(is.na(data_unique))[1] 16
colSums(is.na(data_unique)) id_patient birthdate sex
0 0 0
test_date location test_procedure_name
0 16 0
test_procedure_result result_detail result_category
0 0 0
# to see the missing values
data_unique[rowSums(is.na(data_unique)) > 0, ]# A tibble: 16 × 9
id_patient birthdate sex test_date location
<chr> <dttm> <chr> <dttm> <chr>
1 249XDFDX36 1937-09-21 00:00:00 Male 2024-05-03 17:53:30 <NA>
2 249XDFDX36 1937-09-21 00:00:00 Male 2024-05-03 17:53:30 <NA>
3 167XDFDX58 1937-12-12 00:00:00 Male 2024-04-13 13:56:36 <NA>
4 128XDFDX77 1938-10-27 00:00:00 Female 2025-01-08 14:46:25 <NA>
5 197XDFDX65 1940-01-07 00:00:00 Female 2024-08-22 19:27:36 <NA>
6 249XDFDX42 1943-08-01 00:00:00 Female 2024-08-12 13:40:21 <NA>
7 183XDFDX93 1953-11-04 00:00:00 Male 2024-04-08 09:23:12 <NA>
8 213XDFDX72 1954-05-17 00:00:00 Male 2024-08-01 22:41:19 <NA>
9 249XDFDX42 1967-02-11 00:00:00 Female 2024-12-05 00:06:54 <NA>
10 249XDFDX42 1967-02-11 00:00:00 Female 2024-12-05 00:06:54 <NA>
11 249XDFDX41 1975-07-21 00:00:00 Female 2024-11-09 17:46:40 <NA>
12 249XDFDX41 1975-07-21 00:00:00 Female 2024-11-09 17:46:40 <NA>
13 180XDFDX42 1995-05-02 00:00:00 Male 2024-12-24 15:55:21 <NA>
14 161XDFDX16 1998-05-21 00:00:00 Female 2024-10-02 06:06:55 <NA>
15 197XDFDX53 2004-09-28 00:00:00 Female 2024-04-24 06:24:55 <NA>
16 166XDFDX89 2008-02-21 00:00:00 Female 2024-09-11 23:07:36 <NA>
# ℹ 4 more variables: test_procedure_name <chr>, test_procedure_result <chr>,
# result_detail <chr>, result_category <chr>
data_unique[!complete.cases(data_unique), ]# A tibble: 16 × 9
id_patient birthdate sex test_date location
<chr> <dttm> <chr> <dttm> <chr>
1 249XDFDX36 1937-09-21 00:00:00 Male 2024-05-03 17:53:30 <NA>
2 249XDFDX36 1937-09-21 00:00:00 Male 2024-05-03 17:53:30 <NA>
3 167XDFDX58 1937-12-12 00:00:00 Male 2024-04-13 13:56:36 <NA>
4 128XDFDX77 1938-10-27 00:00:00 Female 2025-01-08 14:46:25 <NA>
5 197XDFDX65 1940-01-07 00:00:00 Female 2024-08-22 19:27:36 <NA>
6 249XDFDX42 1943-08-01 00:00:00 Female 2024-08-12 13:40:21 <NA>
7 183XDFDX93 1953-11-04 00:00:00 Male 2024-04-08 09:23:12 <NA>
8 213XDFDX72 1954-05-17 00:00:00 Male 2024-08-01 22:41:19 <NA>
9 249XDFDX42 1967-02-11 00:00:00 Female 2024-12-05 00:06:54 <NA>
10 249XDFDX42 1967-02-11 00:00:00 Female 2024-12-05 00:06:54 <NA>
11 249XDFDX41 1975-07-21 00:00:00 Female 2024-11-09 17:46:40 <NA>
12 249XDFDX41 1975-07-21 00:00:00 Female 2024-11-09 17:46:40 <NA>
13 180XDFDX42 1995-05-02 00:00:00 Male 2024-12-24 15:55:21 <NA>
14 161XDFDX16 1998-05-21 00:00:00 Female 2024-10-02 06:06:55 <NA>
15 197XDFDX53 2004-09-28 00:00:00 Female 2024-04-24 06:24:55 <NA>
16 166XDFDX89 2008-02-21 00:00:00 Female 2024-09-11 23:07:36 <NA>
# ℹ 4 more variables: test_procedure_name <chr>, test_procedure_result <chr>,
# result_detail <chr>, result_category <chr>
data_unique$location[is.na(data_unique$location)] <- "unknown"sum(is.na(data_unique))[1] 0
unique(data_unique$sex)[1] "Male" "Female" "M" "F" "MALE"
sum(data_unique$sex =="Female")[1] 134053
sum(data_unique$sex =="Male")[1] 145128
sum(data_unique$sex =="M")[1] 1109
sum(data_unique$sex =="F")[1] 3790
sum(data_unique$sex =="MALE")[1] 319
data_unique$sex <- tolower(data_unique$sex)unique(data_unique$sex)[1] "male" "female" "m" "f"
data_unique$sex[data_unique$sex =="m"] <- "male"
data_unique$sex[data_unique$sex =="f"] <- "female"sum(data_unique$sex =="Female")[1] 0
sum(data_unique$sex =="Male")[1] 0
sum(data_unique$sex =="M")[1] 0
sum(data_unique$sex =="F")[1] 0
sum(data_unique$sex =="MALE")[1] 0
sum(data_unique$sex =="male")[1] 146556
sum(data_unique$sex =="female")[1] 137843
First we will create a new variable age from birth date and date of test.
class(data_unique$birthdate)[1] "POSIXct" "POSIXt"
class(data_unique$test_date)[1] "POSIXct" "POSIXt"
# convert birthdate and test_date into date formate, defualt formate is YYYY-MM-DD, if your data is in different formate than specify formate using as.Date(birthdate, format = "%d-%m-%Y")
data_unique <- data_unique %>%
mutate(
birthdate = as.Date(birthdate),
test_date = as.Date(test_date)
)library(lubridate)
Attaching package: 'lubridate'
The following objects are masked from 'package:base':
date, intersect, setdiff, union
data_unique <- data_unique %>%
mutate(age = as.numeric(difftime(test_date, birthdate, units = "days")) / 365.25)colnames(data_unique) [1] "id_patient" "birthdate" "sex"
[4] "test_date" "location" "test_procedure_name"
[7] "test_procedure_result" "result_detail" "result_category"
[10] "age"
# Mean age for males
data_unique %>%
filter(sex == "male") %>%
summarise(mean_age = mean(age, na.rm = TRUE))# A tibble: 1 × 1
mean_age
<dbl>
1 51.1
# Mean age for females
data_unique %>%
filter(sex == "female") %>%
summarise(mean_age = mean(age, na.rm = TRUE))# A tibble: 1 × 1
mean_age
<dbl>
1 51.0
data_unique %>%
group_by(sex) %>%
summarise(mean_age = mean(age, na.rm = TRUE))# A tibble: 2 × 2
sex mean_age
<chr> <dbl>
1 female 51.0
2 male 51.1
data_unique %>%
filter(test_procedure_name == "Hemoglobin") %>%
mutate(test_procedure_result = as.numeric(test_procedure_result)) %>%
group_by(sex) %>%
summarise(mean_hemoglobin = mean(test_procedure_result))# A tibble: 2 × 2
sex mean_hemoglobin
<chr> <dbl>
1 female 14.9
2 male 15.1
unique(data_unique$test_procedure_name)[1] "Hemoglobin" "Blood Culture" "Histopathology Report"
[4] "Blood Glucose"
# write cleaned data into new excel file
# write_xlsx(data_unique,"cleaned_dummy_patient_data_R_workshop.xlsx")data <- data %>%
mutate(
result_category = case_when(
test_procedure_result < 70 &
test_procedure_name == "Blood Glucose" ~ "hypoglycemia",
test_procedure_result > 125 &
test_procedure_name == "Blood Glucose" ~ "hyperglycemia",
test_procedure_result < 12 &
test_procedure_name == "Hemoglobin" ~ "anemia",
test_procedure_result > 18 &
test_procedure_name == "Hemoglobin" ~ "polycythemia",
TRUE ~ "normal"
)
)