This is a walkthrough of survival analysis for Control vs. CCNE1-high ovarian cancer patients. To begin, import your data files. The data I am using is taken from cBioPortal. Specifically, once all the data is downloaded, the RNA-seq data is taken from the file titled “data_mrna_seq_v2_rsem.txt”.
rna_seq <- read.table("/Users/richardfang/Computer Science/R-practice/CCNE1-project/data_mrna_seq_v2_rsem.txt",
header = TRUE)
head(rna_seq[, 1:5], 10)
## Hugo_Symbol Entrez_Gene_Id TCGA.04.1348.01 TCGA.04.1357.01 TCGA.04.1362.01
## 1 LOC100130426 100130426 0.0000 0.0000 0.6619
## 2 UBE2Q2P3 100133144 27.3245 21.9661 27.2611
## 3 UBE2Q2P3 100134869 45.8134 36.6276 53.1619
## 4 HMGB1P1 10357 24.0979 9.1146 32.1030
## 5 TIMM23 10431 1509.2767 696.6146 1056.4202
## 6 MOXD2 136542 0.0000 0.0000 0.0000
## 7 LOC155060 155060 200.5916 220.7031 340.8875
## 8 RNU12-2P 26823 0.5378 0.0000 0.3310
## 9 SSX9 280660 0.0000 0.0000 0.0000
## 10 LOC317712 317712 0.0000 0.0000 0.0000
Next, to select our patient populations and stratify based on CCNE1-expression, we will initialize dataframes for our Control and CCNE1-high populations. Here we will also calculate the quartiles based on CCNE1-expression.
control <- rna_seq[, c(1)]
ccne1_high <- rna_seq[, c(1)]
rownames(rna_seq) <- make.names(rna_seq[, 1], unique = TRUE)
rna_seq <- rna_seq[-c(1:2)]
quartiles <- quantile(as.numeric(rna_seq["CCNE1", ]), probs = c(0.25,
0.75))
quartiles[1]
quartiles[2]
## 25%
## 381.7844
## 75%
## 1197.788
With the CCNE1-expression for our quartiles calculated, we will next loop through the RNA-seq data and add the patient samples to the appropriate dataframes. Additionally, we will clean up the dataframes and make the row names the genes.
for (i in 1:ncol(rna_seq)) {
if (rna_seq["CCNE1", i] <= quartiles[1]) {
entry <- data.frame(rna_seq[, i])
colnames(entry) <- c(colnames(rna_seq)[i])
control <- cbind(control, entry)
} else if (rna_seq["CCNE1", i] >= quartiles[2]) {
entry <- data.frame(rna_seq[, i])
colnames(entry) <- c(colnames(rna_seq)[i])
ccne1_high <- cbind(ccne1_high, entry)
}
}
rownames(control) <- make.names(control[, 1], unique = TRUE)
control <- control[-c(1)]
rownames(ccne1_high) <- make.names(ccne1_high[, 1], unique = TRUE)
ccne1_high <- ccne1_high[-c(1)]
head(control[, 1:4], 10)
## TCGA.04.1514.01 TCGA.04.1519.01 TCGA.09.0364.01 TCGA.09.1662.01
## LOC100130426 0.0000 0.0000 0.0000 0.2693
## UBE2Q2P3 3.5924 23.9875 25.9322 39.7792
## UBE2Q2P3.1 11.6638 27.1918 49.5021 80.6087
## HMGB1P1 28.7075 38.6293 65.7950 46.3076
## TIMM23 1059.8733 1179.3502 557.4006 702.1277
## MOXD2 0.0000 0.0000 0.0000 0.0000
## LOC155060 357.8008 80.9969 529.8468 159.9785
## RNU12.2P 0.5757 0.8901 1.3551 0.2693
## SSX9 0.0000 0.0000 0.0000 0.0000
## LOC317712 0.0000 0.0000 0.0000 0.0000
head(ccne1_high[, 1:4], 10)
## TCGA.04.1348.01 TCGA.04.1364.01 TCGA.09.1659.01 TCGA.09.1667.01
## LOC100130426 0.0000 0.0000 0.0000 0.4254
## UBE2Q2P3 27.3245 18.6062 20.4247 7.3979
## UBE2Q2P3.1 45.8134 14.4677 36.8670 12.1710
## HMGB1P1 24.0979 30.2498 20.7732 27.6517
## TIMM23 1509.2767 1372.5640 1416.6667 1155.8419
## MOXD2 0.0000 0.0000 0.0000 0.0000
## LOC155060 200.5916 306.5690 212.7404 237.3794
## RNU12.2P 0.5378 0.0000 3.2051 0.4254
## SSX9 0.0000 0.0000 0.0000 0.0000
## LOC317712 0.0000 0.0000 0.0000 0.0000
To confirm that we have the correct number of patients in each set, let’s check the number of columns in each dataframe.
ncol(control)
## [1] 77
ncol(ccne1_high)
## [1] 77
With our new dataframes, we want to get all the patient ID’s from each set of patients. We also need to clean up the patient ID’s and format them the same way as our clinical data that we will import later on.
control_names <- colnames(control)
ccne1_names <- colnames(ccne1_high)
for (i in 1:length(control_names)) {
control_names[i] <- str_trunc(toString(gsub(".", "-", control_names[i],
fixed = TRUE)), 12, ellipsis = "")
}
for (i in 1:length(ccne1_names)) {
ccne1_names[i] <- str_trunc(toString(gsub(".", "-", ccne1_names[i],
fixed = TRUE)), 12, ellipsis = "")
}
print(control_names)
## [1] "TCGA-04-1514" "TCGA-04-1519" "TCGA-09-0364" "TCGA-09-1662" "TCGA-09-1669"
## [6] "TCGA-09-2045" "TCGA-13-0800" "TCGA-13-1403" "TCGA-13-1498" "TCGA-13-1505"
## [11] "TCGA-20-1687" "TCGA-23-2081" "TCGA-24-1425" "TCGA-24-1427" "TCGA-24-1430"
## [16] "TCGA-24-1463" "TCGA-24-1474" "TCGA-24-1545" "TCGA-24-1546" "TCGA-24-1553"
## [21] "TCGA-24-1563" "TCGA-24-1564" "TCGA-24-1567" "TCGA-24-1604" "TCGA-24-1846"
## [26] "TCGA-24-2024" "TCGA-24-2038" "TCGA-24-2289" "TCGA-24-2293" "TCGA-25-1313"
## [31] "TCGA-25-1315" "TCGA-25-1316" "TCGA-25-1328" "TCGA-25-1625" "TCGA-25-1627"
## [36] "TCGA-25-1632" "TCGA-25-1633" "TCGA-25-1634" "TCGA-25-1870" "TCGA-25-1871"
## [41] "TCGA-25-2392" "TCGA-29-1690" "TCGA-29-1696" "TCGA-29-1702" "TCGA-29-1705"
## [46] "TCGA-29-1776" "TCGA-29-1778" "TCGA-29-2414" "TCGA-29-2414" "TCGA-29-2425"
## [51] "TCGA-29-A5NZ" "TCGA-30-1857" "TCGA-30-1861" "TCGA-30-1862" "TCGA-31-1956"
## [56] "TCGA-31-1959" "TCGA-36-1569" "TCGA-36-1571" "TCGA-36-1577" "TCGA-36-1578"
## [61] "TCGA-36-1580" "TCGA-59-2352" "TCGA-59-2363" "TCGA-61-1721" "TCGA-61-1725"
## [66] "TCGA-61-1900" "TCGA-61-1907" "TCGA-61-1910" "TCGA-61-2009" "TCGA-61-2016"
## [71] "TCGA-61-2088" "TCGA-61-2095" "TCGA-61-2097" "TCGA-61-2109" "TCGA-OY-A56P"
## [76] "TCGA-OY-A56Q" "TCGA-WR-A838"
print(ccne1_names)
## [1] "TCGA-04-1348" "TCGA-04-1364" "TCGA-09-1659" "TCGA-09-1667" "TCGA-09-1673"
## [6] "TCGA-09-2054" "TCGA-10-0936" "TCGA-13-0730" "TCGA-13-0801" "TCGA-13-0893"
## [11] "TCGA-13-0920" "TCGA-13-1410" "TCGA-13-1506" "TCGA-13-1507" "TCGA-13-2060"
## [16] "TCGA-20-1683" "TCGA-20-1684" "TCGA-23-1109" "TCGA-23-1120" "TCGA-23-1122"
## [21] "TCGA-23-1123" "TCGA-23-2078" "TCGA-23-2084" "TCGA-24-1417" "TCGA-24-1418"
## [26] "TCGA-24-1424" "TCGA-24-1469" "TCGA-24-1544" "TCGA-24-1548" "TCGA-24-1552"
## [31] "TCGA-24-1558" "TCGA-24-1842" "TCGA-24-1843" "TCGA-24-1850" "TCGA-24-1928"
## [36] "TCGA-24-2019" "TCGA-24-2020" "TCGA-24-2023" "TCGA-24-2026" "TCGA-24-2261"
## [41] "TCGA-24-2267" "TCGA-24-2288" "TCGA-24-2298" "TCGA-25-1312" "TCGA-25-1320"
## [46] "TCGA-25-1322" "TCGA-25-1329" "TCGA-25-1631" "TCGA-25-2396" "TCGA-29-1695"
## [51] "TCGA-29-1698" "TCGA-29-1699" "TCGA-29-1701" "TCGA-29-1703" "TCGA-29-1710"
## [56] "TCGA-29-1761" "TCGA-29-1785" "TCGA-29-2427" "TCGA-29-2428" "TCGA-30-1855"
## [61] "TCGA-31-1953" "TCGA-36-1576" "TCGA-36-1581" "TCGA-57-1583" "TCGA-57-1993"
## [66] "TCGA-57-1994" "TCGA-59-A5PD" "TCGA-61-1728" "TCGA-61-1733" "TCGA-61-1736"
## [71] "TCGA-61-1738" "TCGA-61-1917" "TCGA-61-1919" "TCGA-61-1998" "TCGA-61-2012"
## [76] "TCGA-61-2102" "TCGA-61-2113"
Next we will import our clinical data and clean it up by extracting only the data we need. To conduct our survival analysis, we will need to extract the patient ID, the patient status, and the patient survival time. Again, the data we are using is from cBioPortal, and the specific file is titled “data_clinical_patient.txt”, but I have done some manual cleaning by removing the first few rows from the file.
1 = deceased
0 = living
patient_data <- read.csv("/Users/richardfang/Computer Science/R-practice/CCNE1-project/data_clinical_patient_CLEANED.csv",
fill = TRUE, header = TRUE)
patient_data$OS_STATUS <- as.numeric(str_trunc(patient_data$OS_STATUS,
1, ellipsis = ""))
survival_data <- patient_data[, c("PATIENT_ID", "OS_STATUS",
"OS_MONTHS")]
head(survival_data, 20)
## PATIENT_ID OS_STATUS OS_MONTHS
## 1 TCGA-04-1331 1 43.89
## 2 TCGA-04-1332 1 40.97
## 3 TCGA-04-1335 1 1.81
## 4 TCGA-04-1336 0 49.11
## 5 TCGA-04-1337 1 2
## 6 TCGA-04-1338 0 46.58
## 7 TCGA-04-1341 0 1.08
## 8 TCGA-04-1342 1 18.5
## 9 TCGA-04-1343 1 11.86
## 10 TCGA-04-1346 0 65.47
## 11 TCGA-04-1347 0 63.04
## 12 TCGA-04-1348 1 48.72
## 13 TCGA-04-1349 1 21.55
## 14 TCGA-04-1350 1 63.93
## 15 TCGA-04-1351 0 65.41
## 16 TCGA-04-1353 0 0.53
## 17 TCGA-04-1356 1 49.24
## 18 TCGA-04-1357 0 [Not Available]
## 19 TCGA-04-1360 0 [Not Available]
## 20 TCGA-04-1361 0 32.49
As we can see in the dataframe above, there are some patients without data available. Because this will not be useful, we need to clean this data further by looping through and removing any patient without data.
remove_rows <- c()
for (i in 1:nrow(survival_data)) {
if (survival_data[i, "OS_MONTHS"] == "[Not Available]") {
remove_rows <- c(remove_rows, i)
}
}
survival_data <- survival_data[-remove_rows, ]
head(survival_data, 20)
## PATIENT_ID OS_STATUS OS_MONTHS
## 1 TCGA-04-1331 1 43.89
## 2 TCGA-04-1332 1 40.97
## 3 TCGA-04-1335 1 1.81
## 4 TCGA-04-1336 0 49.11
## 5 TCGA-04-1337 1 2
## 6 TCGA-04-1338 0 46.58
## 7 TCGA-04-1341 0 1.08
## 8 TCGA-04-1342 1 18.5
## 9 TCGA-04-1343 1 11.86
## 10 TCGA-04-1346 0 65.47
## 11 TCGA-04-1347 0 63.04
## 12 TCGA-04-1348 1 48.72
## 13 TCGA-04-1349 1 21.55
## 14 TCGA-04-1350 1 63.93
## 15 TCGA-04-1351 0 65.41
## 16 TCGA-04-1353 0 0.53
## 17 TCGA-04-1356 1 49.24
## 20 TCGA-04-1361 0 32.49
## 21 TCGA-04-1362 1 44.28
## 22 TCGA-04-1364 1 33.64
Now with our clinical data cleaned, we would like to select specific patients based on CCNE1-expression. Remember, earlier we created vectors with our patients of interest, so now we will use those vectors to select the clinical data of our patients.
control_survival <- survival_data[survival_data$PATIENT_ID %in%
control_names, ]
ccne1_survival <- survival_data[survival_data$PATIENT_ID %in%
ccne1_names, ]
length(control_names)
## [1] 77
length(ccne1_names)
## [1] 77
nrow(control_survival)
## [1] 76
nrow(ccne1_survival)
## [1] 77
With our patient clinical samples defined, we need to remove any outliers that may skew results. To do this, we will employ the formula \(Outlier > Mean + (3 * Standard Deviation)\) or \(Outlier < Mean - (3 * Standard Deviation)\).
ccne1_outliers <- c()
for (i in 1:nrow(ccne1_survival)) {
if (as.numeric(ccne1_survival$OS_MONTHS[i]) > mean(as.numeric(ccne1_survival$OS_MONTHS) +
(sd(as.numeric(ccne1_survival$OS_MONTHS)) * 3)) | as.numeric(ccne1_survival$OS_MONTHS[i]) <
mean(as.numeric(ccne1_survival$OS_MONTHS)) - (sd(as.numeric(ccne1_survival$OS_MONTHS)) *
3)) {
ccne1_outliers <- c(ccne1_outliers, i)
}
}
if (length(ccne1_outliers) != 0) {
ccne1_survival <- ccne1_survival[-ccne1_outliers, ]
}
control_outliers <- c()
for (i in 1:nrow(control_survival)) {
if (as.numeric(control_survival$OS_MONTHS[i]) > mean(as.numeric(control_survival$OS_MONTHS) +
(sd(as.numeric(control_survival$OS_MONTHS)) * 3)) | as.numeric(control_survival$OS_MONTHS[i]) <
mean(as.numeric(control_survival$OS_MONTHS)) - (sd(as.numeric(control_survival$OS_MONTHS)) *
3)) {
control_outliers <- c(control_outliers, i)
}
}
if (length(control_outliers) != 0) {
control_survival <- control_survival[-control_outliers, ]
}
nrow(control_survival)
## [1] 76
nrow(ccne1_survival)
## [1] 76
Now that our outliers are removed, we can begin to plot our survival curves. We will plot the data censoring based on status. Only patients who are deceased will be included on the plot.
control_survival$OS_MONTHS <- as.numeric(as.character(control_survival$OS_MONTHS))
control_survival$OS_STATUS <- as.numeric(as.character(control_survival$OS_STATUS))
ccne1_survival$OS_MONTHS <- as.numeric(as.character(ccne1_survival$OS_MONTHS))
ccne1_survival$OS_STATUS <- as.numeric(as.character(ccne1_survival$OS_STATUS))
sf.control <- survfit2(Surv(OS_MONTHS, OS_STATUS) ~ 1, data = control_survival)
sf.control %>%
ggsurvfit() + add_confidence_interval() + add_risktable()
sf.control %>%
tbl_survfit(probs = 0.5, label_header = "**Median survival (95% CI)**")
Characteristic | Median survival (95% CI) |
---|---|
Overall | 48 (35, 62) |
sf.ccne1 <- survfit2(Surv(OS_MONTHS, OS_STATUS) ~ 1, data = ccne1_survival)
sf.ccne1 %>%
ggsurvfit() + add_confidence_interval() + add_risktable()
sf.ccne1 %>%
tbl_survfit(probs = 0.5, label_header = "**Median survival (95% CI)**")
Characteristic | Median survival (95% CI) |
---|---|
Overall | 38 (34, 45) |
We have both of our survival curves, so now let’s plot on the same graph now.
all_surv <- rbind((cbind(control_survival, type = "control")),
(cbind(ccne1_survival, type = "ccne1")))
sf.all <- survfit2(Surv(as.numeric(OS_MONTHS), OS_STATUS) ~ type,
data = all_surv)
sf.all %>%
ggsurvfit() + add_confidence_interval() + add_risktable()
sf.all %>%
tbl_survfit(probs = 0.5, label_header = "**Median survival (95% CI)**")
Characteristic | Median survival (95% CI) |
---|---|
type | |
ccne1 | 38 (34, 45) |
control | 48 (35, 62) |
remove_control <- which(grepl("0", as.character(control_survival$OS_STATUS)))
control_survival <- control_survival[-remove_control, ]
remove_ccne1 <- which(grepl("0", as.character(ccne1_survival$OS_STATUS)))
ccne1_survival <- ccne1_survival[-remove_ccne1, ]
surv_no_living <- rbind((cbind(control_survival, type = "control")),
(cbind(ccne1_survival, type = "ccne1")))
sf.all <- survfit2(Surv(as.numeric(OS_MONTHS), OS_STATUS) ~ type,
data = surv_no_living)
sf.all %>%
ggsurvfit() + add_confidence_interval() + add_risktable()
sf.all %>%
tbl_survfit(probs = 0.5, label_header = "**Median survival (95% CI)**")
Characteristic | Median survival (95% CI) |
---|---|
type | |
ccne1 | 34 (21, 38) |
control | 35 (27, 57) |
As far as I understand, “censoring” for survival analysis accounts for the living patients, so excluding them in the analysis is incorrect
Now that we have our KM plots, let’s take a look at gene expression for specified genes between control and CCNE1-high patients. We have already created these dataframes earlier, so we will continue to work with them.
## [1] "CCNE1"
## [1] "Control mean: "
## [1] 220.5552
## [1] "CCNE1-high mean: "
## [1] 2626.943
## [1] "GPX4"
## [1] "Control mean: "
## [1] 3751.398
## [1] "CCNE1-high mean: "
## [1] 5260.06
## [1] "FTH1"
## [1] "Control mean: "
## [1] 55286.35
## [1] "CCNE1-high mean: "
## [1] 70301.04
## [1] "FTL"
## [1] "Control mean: "
## [1] 39652.52
## [1] "CCNE1-high mean: "
## [1] 58126.86
## [1] "TFRC"
## [1] "Control mean: "
## [1] 3350.357
## [1] "CCNE1-high mean: "
## [1] 4315.588
## [1] "SLC7A11"
## [1] "Control mean: "
## [1] 373.0885
## [1] "CCNE1-high mean: "
## [1] 338.5149
## [1] "SLC11A2"
## [1] "Control mean: "
## [1] 2261.683
## [1] "CCNE1-high mean: "
## [1] 1963.855