Data Cleaning and Preparation in R

Author

Hassan Ghayas

Published

April 26, 2026

📦 Load Libraries

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(readxl)

📂 Load dataset

data <- read_excel("file1_dummy_patient_data_R_workshop.xlsx")
dim(data)
[1] 287230      9
nrow(data)
[1] 287230
ncol(data)
[1] 9

Remove duplicated data

#number of duplicated row
sum(duplicated(data))
[1] 2831
# Square bracket indexing to select rows and columns
# df[row_index,column_index]
data[1:5,]
# A tibble: 5 × 9
  id_patient birthdate           sex   test_date           location  
  <chr>      <dttm>              <chr> <dttm>              <chr>     
1 205XDFDX95 1964-01-12 00:00:00 Male  2024-02-09 12:01:41 Gujranwala
2 205XDFDX95 1964-01-13 00:00:00 Male  2024-02-09 12:01:41 Gujranwala
3 205XDFDX95 1964-01-14 00:00:00 Male  2024-02-09 12:01:41 Gujranwala
4 205XDFDX95 1964-01-15 00:00:00 Male  2024-02-09 12:01:41 Gujranwala
5 199XDFDX53 1964-01-16 00:00:00 Male  2024-11-06 03:16:28 Mardan    
# ℹ 4 more variables: test_procedure_name <chr>, test_procedure_result <chr>,
#   result_detail <chr>, result_category <chr>
#to see duplicate data
data[duplicated(data), ]
# A tibble: 2,831 × 9
   id_patient birthdate           sex    test_date           location    
   <chr>      <dttm>              <chr>  <dttm>              <chr>       
 1 176XDFDX98 1935-02-26 00:00:00 Female 2024-02-04 00:17:34 Lahore      
 2 240XDFDX13 1935-02-26 00:00:00 Female 2024-08-22 17:04:02 Muzaffarabad
 3 185XDFDX92 1935-03-12 00:00:00 Male   2024-10-31 14:56:58 Faisalabad  
 4 241XDFDX75 1935-03-15 00:00:00 Female 2025-01-04 10:09:33 Bannu       
 5 240XDFDX40 1935-04-04 00:00:00 Female 2024-10-27 03:32:13 Peshawar    
 6 118XDFDX38 1935-04-23 00:00:00 Female 2024-05-20 20:35:23 Islamabad   
 7 176XDFDX84 1935-05-01 00:00:00 Female 2024-01-11 20:47:42 Abbottabad  
 8 245XDFDX57 1935-05-28 00:00:00 Female 2024-12-07 14:38:04 Peshawar    
 9 162XDFDX82 1935-06-02 00:00:00 Female 2024-09-12 22:08:01 Mirpur      
10 248XDFDX36 1935-06-28 00:00:00 Male   2024-05-24 17:27:38 Hyderabad   
# ℹ 2,821 more rows
# ℹ 4 more variables: test_procedure_name <chr>, test_procedure_result <chr>,
#   result_detail <chr>, result_category <chr>
# duplicate data including the first occurrence
data[duplicated(data) | duplicated(data, fromLast = TRUE), ]
# A tibble: 5,629 × 9
   id_patient birthdate           sex    test_date           location    
   <chr>      <dttm>              <chr>  <dttm>              <chr>       
 1 176XDFDX98 1935-02-26 00:00:00 Female 2024-02-04 00:17:34 Lahore      
 2 176XDFDX98 1935-02-26 00:00:00 Female 2024-02-04 00:17:34 Lahore      
 3 240XDFDX13 1935-02-26 00:00:00 Female 2024-08-22 17:04:02 Muzaffarabad
 4 240XDFDX13 1935-02-26 00:00:00 Female 2024-08-22 17:04:02 Muzaffarabad
 5 185XDFDX92 1935-03-12 00:00:00 Male   2024-10-31 14:56:58 Faisalabad  
 6 185XDFDX92 1935-03-12 00:00:00 Male   2024-10-31 14:56:58 Faisalabad  
 7 241XDFDX75 1935-03-15 00:00:00 Female 2025-01-04 10:09:33 Bannu       
 8 241XDFDX75 1935-03-15 00:00:00 Female 2025-01-04 10:09:33 Bannu       
 9 240XDFDX40 1935-04-04 00:00:00 Female 2024-10-27 03:32:13 Peshawar    
10 240XDFDX40 1935-04-04 00:00:00 Female 2024-10-27 03:32:13 Peshawar    
# ℹ 5,619 more rows
# ℹ 4 more variables: test_procedure_name <chr>, test_procedure_result <chr>,
#   result_detail <chr>, result_category <chr>
# remove duplicated data
data_unique <- unique(data)
dim(data_unique)
[1] 284399      9

rename variable

names(data_unique)[names(data_unique)=="date of test"] <- "test_date"
names(data_unique)[names(data_unique)=="result detail"] <- "result_detail"

Missing value

sum(is.na(data_unique))
[1] 16
colSums(is.na(data_unique))
           id_patient             birthdate                   sex 
                    0                     0                     0 
            test_date              location   test_procedure_name 
                    0                    16                     0 
test_procedure_result         result_detail       result_category 
                    0                     0                     0 
# to see the missing values
data_unique[rowSums(is.na(data_unique)) > 0, ]
# A tibble: 16 × 9
   id_patient birthdate           sex    test_date           location
   <chr>      <dttm>              <chr>  <dttm>              <chr>   
 1 249XDFDX36 1937-09-21 00:00:00 Male   2024-05-03 17:53:30 <NA>    
 2 249XDFDX36 1937-09-21 00:00:00 Male   2024-05-03 17:53:30 <NA>    
 3 167XDFDX58 1937-12-12 00:00:00 Male   2024-04-13 13:56:36 <NA>    
 4 128XDFDX77 1938-10-27 00:00:00 Female 2025-01-08 14:46:25 <NA>    
 5 197XDFDX65 1940-01-07 00:00:00 Female 2024-08-22 19:27:36 <NA>    
 6 249XDFDX42 1943-08-01 00:00:00 Female 2024-08-12 13:40:21 <NA>    
 7 183XDFDX93 1953-11-04 00:00:00 Male   2024-04-08 09:23:12 <NA>    
 8 213XDFDX72 1954-05-17 00:00:00 Male   2024-08-01 22:41:19 <NA>    
 9 249XDFDX42 1967-02-11 00:00:00 Female 2024-12-05 00:06:54 <NA>    
10 249XDFDX42 1967-02-11 00:00:00 Female 2024-12-05 00:06:54 <NA>    
11 249XDFDX41 1975-07-21 00:00:00 Female 2024-11-09 17:46:40 <NA>    
12 249XDFDX41 1975-07-21 00:00:00 Female 2024-11-09 17:46:40 <NA>    
13 180XDFDX42 1995-05-02 00:00:00 Male   2024-12-24 15:55:21 <NA>    
14 161XDFDX16 1998-05-21 00:00:00 Female 2024-10-02 06:06:55 <NA>    
15 197XDFDX53 2004-09-28 00:00:00 Female 2024-04-24 06:24:55 <NA>    
16 166XDFDX89 2008-02-21 00:00:00 Female 2024-09-11 23:07:36 <NA>    
# ℹ 4 more variables: test_procedure_name <chr>, test_procedure_result <chr>,
#   result_detail <chr>, result_category <chr>
data_unique[!complete.cases(data_unique), ]
# A tibble: 16 × 9
   id_patient birthdate           sex    test_date           location
   <chr>      <dttm>              <chr>  <dttm>              <chr>   
 1 249XDFDX36 1937-09-21 00:00:00 Male   2024-05-03 17:53:30 <NA>    
 2 249XDFDX36 1937-09-21 00:00:00 Male   2024-05-03 17:53:30 <NA>    
 3 167XDFDX58 1937-12-12 00:00:00 Male   2024-04-13 13:56:36 <NA>    
 4 128XDFDX77 1938-10-27 00:00:00 Female 2025-01-08 14:46:25 <NA>    
 5 197XDFDX65 1940-01-07 00:00:00 Female 2024-08-22 19:27:36 <NA>    
 6 249XDFDX42 1943-08-01 00:00:00 Female 2024-08-12 13:40:21 <NA>    
 7 183XDFDX93 1953-11-04 00:00:00 Male   2024-04-08 09:23:12 <NA>    
 8 213XDFDX72 1954-05-17 00:00:00 Male   2024-08-01 22:41:19 <NA>    
 9 249XDFDX42 1967-02-11 00:00:00 Female 2024-12-05 00:06:54 <NA>    
10 249XDFDX42 1967-02-11 00:00:00 Female 2024-12-05 00:06:54 <NA>    
11 249XDFDX41 1975-07-21 00:00:00 Female 2024-11-09 17:46:40 <NA>    
12 249XDFDX41 1975-07-21 00:00:00 Female 2024-11-09 17:46:40 <NA>    
13 180XDFDX42 1995-05-02 00:00:00 Male   2024-12-24 15:55:21 <NA>    
14 161XDFDX16 1998-05-21 00:00:00 Female 2024-10-02 06:06:55 <NA>    
15 197XDFDX53 2004-09-28 00:00:00 Female 2024-04-24 06:24:55 <NA>    
16 166XDFDX89 2008-02-21 00:00:00 Female 2024-09-11 23:07:36 <NA>    
# ℹ 4 more variables: test_procedure_name <chr>, test_procedure_result <chr>,
#   result_detail <chr>, result_category <chr>
data_unique$location[is.na(data_unique$location)] <- "unknown"
sum(is.na(data_unique))
[1] 0

Data cleaning

unique(data_unique$sex)
[1] "Male"   "Female" "M"      "F"      "MALE"  
sum(data_unique$sex =="Female")
[1] 134053
sum(data_unique$sex =="Male")
[1] 145128
sum(data_unique$sex =="M")
[1] 1109
sum(data_unique$sex =="F")
[1] 3790
sum(data_unique$sex =="MALE")
[1] 319
data_unique$sex <- tolower(data_unique$sex)
unique(data_unique$sex)
[1] "male"   "female" "m"      "f"     
data_unique$sex[data_unique$sex =="m"] <- "male"
data_unique$sex[data_unique$sex =="f"] <- "female"
sum(data_unique$sex =="Female")
[1] 0
sum(data_unique$sex =="Male")
[1] 0
sum(data_unique$sex =="M")
[1] 0
sum(data_unique$sex =="F")
[1] 0
sum(data_unique$sex =="MALE")
[1] 0
sum(data_unique$sex =="male")
[1] 146556
sum(data_unique$sex =="female")
[1] 137843

Data transformation and filtering

First we will create a new variable age from birth date and date of test.

class(data_unique$birthdate)
[1] "POSIXct" "POSIXt" 
class(data_unique$test_date)
[1] "POSIXct" "POSIXt" 
# convert birthdate and test_date into date formate, defualt formate is YYYY-MM-DD, if your data is in different formate than specify formate using as.Date(birthdate, format = "%d-%m-%Y")
data_unique <- data_unique %>%
  mutate(
    birthdate = as.Date(birthdate),
    test_date = as.Date(test_date)
  )
library(lubridate)

Attaching package: 'lubridate'
The following objects are masked from 'package:base':

    date, intersect, setdiff, union
data_unique <- data_unique %>%
  mutate(age = as.numeric(difftime(test_date, birthdate, units = "days")) / 365.25)
colnames(data_unique)
 [1] "id_patient"            "birthdate"             "sex"                  
 [4] "test_date"             "location"              "test_procedure_name"  
 [7] "test_procedure_result" "result_detail"         "result_category"      
[10] "age"                  
# Mean age for males
data_unique %>%
  filter(sex == "male") %>%
  summarise(mean_age = mean(age, na.rm = TRUE))
# A tibble: 1 × 1
  mean_age
     <dbl>
1     51.1
# Mean age for females
data_unique %>%
  filter(sex == "female") %>%
  summarise(mean_age = mean(age, na.rm = TRUE))
# A tibble: 1 × 1
  mean_age
     <dbl>
1     51.0
data_unique %>%
  group_by(sex) %>%
  summarise(mean_age = mean(age, na.rm = TRUE))
# A tibble: 2 × 2
  sex    mean_age
  <chr>     <dbl>
1 female     51.0
2 male       51.1
data_unique %>% 
  filter(test_procedure_name == "Hemoglobin") %>%
  mutate(test_procedure_result = as.numeric(test_procedure_result)) %>% 
  group_by(sex) %>% 
  summarise(mean_hemoglobin = mean(test_procedure_result))
# A tibble: 2 × 2
  sex    mean_hemoglobin
  <chr>            <dbl>
1 female            14.9
2 male              15.1
unique(data_unique$test_procedure_name)
[1] "Hemoglobin"            "Blood Culture"         "Histopathology Report"
[4] "Blood Glucose"        
# write cleaned data into new excel file 
# write_xlsx(data_unique,"cleaned_dummy_patient_data_R_workshop.xlsx")
data <- data %>%
  mutate(
    result_category = case_when(
      test_procedure_result < 70 &
        test_procedure_name == "Blood Glucose" ~ "hypoglycemia",
      
      test_procedure_result > 125 &
        test_procedure_name == "Blood Glucose" ~ "hyperglycemia",
      
      test_procedure_result < 12 &
        test_procedure_name == "Hemoglobin" ~ "anemia",
      
      test_procedure_result > 18 &
        test_procedure_name == "Hemoglobin" ~ "polycythemia",
      
      TRUE ~ "normal"
    )
  )