Survival Analysis

This is a walkthrough of survival analysis for Control vs. CCNE1-high ovarian cancer patients. To begin, import your data files. The data I am using is taken from cBioPortal. Specifically, once all the data is downloaded, the RNA-seq data is taken from the file titled “data_mrna_seq_v2_rsem.txt”.

rna_seq <- read.table("/Users/richardfang/Computer Science/R-practice/CCNE1-project/data_mrna_seq_v2_rsem.txt",
    header = TRUE)

head(rna_seq[, 1:5], 10)
##     Hugo_Symbol Entrez_Gene_Id TCGA.04.1348.01 TCGA.04.1357.01 TCGA.04.1362.01
## 1  LOC100130426      100130426          0.0000          0.0000          0.6619
## 2      UBE2Q2P3      100133144         27.3245         21.9661         27.2611
## 3      UBE2Q2P3      100134869         45.8134         36.6276         53.1619
## 4       HMGB1P1          10357         24.0979          9.1146         32.1030
## 5        TIMM23          10431       1509.2767        696.6146       1056.4202
## 6         MOXD2         136542          0.0000          0.0000          0.0000
## 7     LOC155060         155060        200.5916        220.7031        340.8875
## 8      RNU12-2P          26823          0.5378          0.0000          0.3310
## 9          SSX9         280660          0.0000          0.0000          0.0000
## 10    LOC317712         317712          0.0000          0.0000          0.0000

Stratifying Patients Based on CCNE1-expression

Next, to select our patient populations and stratify based on CCNE1-expression, we will initialize dataframes for our Control and CCNE1-high populations. Here we will also calculate the quartiles based on CCNE1-expression.

control <- rna_seq[, c(1)]
ccne1_high <- rna_seq[, c(1)]

rownames(rna_seq) <- make.names(rna_seq[, 1], unique = TRUE)
rna_seq <- rna_seq[-c(1:2)]


quartiles <- quantile(as.numeric(rna_seq["CCNE1", ]), probs = c(0.25,
    0.75))

quartiles[1]
quartiles[2]
##      25% 
## 381.7844 
##      75% 
## 1197.788

With the CCNE1-expression for our quartiles calculated, we will next loop through the RNA-seq data and add the patient samples to the appropriate dataframes. Additionally, we will clean up the dataframes and make the row names the genes.

for (i in 1:ncol(rna_seq)) {
    if (rna_seq["CCNE1", i] <= quartiles[1]) {
        entry <- data.frame(rna_seq[, i])
        colnames(entry) <- c(colnames(rna_seq)[i])
        control <- cbind(control, entry)
    } else if (rna_seq["CCNE1", i] >= quartiles[2]) {
        entry <- data.frame(rna_seq[, i])
        colnames(entry) <- c(colnames(rna_seq)[i])
        ccne1_high <- cbind(ccne1_high, entry)
    }
}

rownames(control) <- make.names(control[, 1], unique = TRUE)
control <- control[-c(1)]
rownames(ccne1_high) <- make.names(ccne1_high[, 1], unique = TRUE)
ccne1_high <- ccne1_high[-c(1)]

head(control[, 1:4], 10)
##              TCGA.04.1514.01 TCGA.04.1519.01 TCGA.09.0364.01 TCGA.09.1662.01
## LOC100130426          0.0000          0.0000          0.0000          0.2693
## UBE2Q2P3              3.5924         23.9875         25.9322         39.7792
## UBE2Q2P3.1           11.6638         27.1918         49.5021         80.6087
## HMGB1P1              28.7075         38.6293         65.7950         46.3076
## TIMM23             1059.8733       1179.3502        557.4006        702.1277
## MOXD2                 0.0000          0.0000          0.0000          0.0000
## LOC155060           357.8008         80.9969        529.8468        159.9785
## RNU12.2P              0.5757          0.8901          1.3551          0.2693
## SSX9                  0.0000          0.0000          0.0000          0.0000
## LOC317712             0.0000          0.0000          0.0000          0.0000
head(ccne1_high[, 1:4], 10)
##              TCGA.04.1348.01 TCGA.04.1364.01 TCGA.09.1659.01 TCGA.09.1667.01
## LOC100130426          0.0000          0.0000          0.0000          0.4254
## UBE2Q2P3             27.3245         18.6062         20.4247          7.3979
## UBE2Q2P3.1           45.8134         14.4677         36.8670         12.1710
## HMGB1P1              24.0979         30.2498         20.7732         27.6517
## TIMM23             1509.2767       1372.5640       1416.6667       1155.8419
## MOXD2                 0.0000          0.0000          0.0000          0.0000
## LOC155060           200.5916        306.5690        212.7404        237.3794
## RNU12.2P              0.5378          0.0000          3.2051          0.4254
## SSX9                  0.0000          0.0000          0.0000          0.0000
## LOC317712             0.0000          0.0000          0.0000          0.0000

To confirm that we have the correct number of patients in each set, let’s check the number of columns in each dataframe.

ncol(control)
## [1] 77
ncol(ccne1_high)
## [1] 77

Clean Up Data

With our new dataframes, we want to get all the patient ID’s from each set of patients. We also need to clean up the patient ID’s and format them the same way as our clinical data that we will import later on.

control_names <- colnames(control)
ccne1_names <- colnames(ccne1_high)

for (i in 1:length(control_names)) {
    control_names[i] <- str_trunc(toString(gsub(".", "-", control_names[i],
        fixed = TRUE)), 12, ellipsis = "")
}

for (i in 1:length(ccne1_names)) {
    ccne1_names[i] <- str_trunc(toString(gsub(".", "-", ccne1_names[i],
        fixed = TRUE)), 12, ellipsis = "")
}

print(control_names)
##  [1] "TCGA-04-1514" "TCGA-04-1519" "TCGA-09-0364" "TCGA-09-1662" "TCGA-09-1669"
##  [6] "TCGA-09-2045" "TCGA-13-0800" "TCGA-13-1403" "TCGA-13-1498" "TCGA-13-1505"
## [11] "TCGA-20-1687" "TCGA-23-2081" "TCGA-24-1425" "TCGA-24-1427" "TCGA-24-1430"
## [16] "TCGA-24-1463" "TCGA-24-1474" "TCGA-24-1545" "TCGA-24-1546" "TCGA-24-1553"
## [21] "TCGA-24-1563" "TCGA-24-1564" "TCGA-24-1567" "TCGA-24-1604" "TCGA-24-1846"
## [26] "TCGA-24-2024" "TCGA-24-2038" "TCGA-24-2289" "TCGA-24-2293" "TCGA-25-1313"
## [31] "TCGA-25-1315" "TCGA-25-1316" "TCGA-25-1328" "TCGA-25-1625" "TCGA-25-1627"
## [36] "TCGA-25-1632" "TCGA-25-1633" "TCGA-25-1634" "TCGA-25-1870" "TCGA-25-1871"
## [41] "TCGA-25-2392" "TCGA-29-1690" "TCGA-29-1696" "TCGA-29-1702" "TCGA-29-1705"
## [46] "TCGA-29-1776" "TCGA-29-1778" "TCGA-29-2414" "TCGA-29-2414" "TCGA-29-2425"
## [51] "TCGA-29-A5NZ" "TCGA-30-1857" "TCGA-30-1861" "TCGA-30-1862" "TCGA-31-1956"
## [56] "TCGA-31-1959" "TCGA-36-1569" "TCGA-36-1571" "TCGA-36-1577" "TCGA-36-1578"
## [61] "TCGA-36-1580" "TCGA-59-2352" "TCGA-59-2363" "TCGA-61-1721" "TCGA-61-1725"
## [66] "TCGA-61-1900" "TCGA-61-1907" "TCGA-61-1910" "TCGA-61-2009" "TCGA-61-2016"
## [71] "TCGA-61-2088" "TCGA-61-2095" "TCGA-61-2097" "TCGA-61-2109" "TCGA-OY-A56P"
## [76] "TCGA-OY-A56Q" "TCGA-WR-A838"
print(ccne1_names)
##  [1] "TCGA-04-1348" "TCGA-04-1364" "TCGA-09-1659" "TCGA-09-1667" "TCGA-09-1673"
##  [6] "TCGA-09-2054" "TCGA-10-0936" "TCGA-13-0730" "TCGA-13-0801" "TCGA-13-0893"
## [11] "TCGA-13-0920" "TCGA-13-1410" "TCGA-13-1506" "TCGA-13-1507" "TCGA-13-2060"
## [16] "TCGA-20-1683" "TCGA-20-1684" "TCGA-23-1109" "TCGA-23-1120" "TCGA-23-1122"
## [21] "TCGA-23-1123" "TCGA-23-2078" "TCGA-23-2084" "TCGA-24-1417" "TCGA-24-1418"
## [26] "TCGA-24-1424" "TCGA-24-1469" "TCGA-24-1544" "TCGA-24-1548" "TCGA-24-1552"
## [31] "TCGA-24-1558" "TCGA-24-1842" "TCGA-24-1843" "TCGA-24-1850" "TCGA-24-1928"
## [36] "TCGA-24-2019" "TCGA-24-2020" "TCGA-24-2023" "TCGA-24-2026" "TCGA-24-2261"
## [41] "TCGA-24-2267" "TCGA-24-2288" "TCGA-24-2298" "TCGA-25-1312" "TCGA-25-1320"
## [46] "TCGA-25-1322" "TCGA-25-1329" "TCGA-25-1631" "TCGA-25-2396" "TCGA-29-1695"
## [51] "TCGA-29-1698" "TCGA-29-1699" "TCGA-29-1701" "TCGA-29-1703" "TCGA-29-1710"
## [56] "TCGA-29-1761" "TCGA-29-1785" "TCGA-29-2427" "TCGA-29-2428" "TCGA-30-1855"
## [61] "TCGA-31-1953" "TCGA-36-1576" "TCGA-36-1581" "TCGA-57-1583" "TCGA-57-1993"
## [66] "TCGA-57-1994" "TCGA-59-A5PD" "TCGA-61-1728" "TCGA-61-1733" "TCGA-61-1736"
## [71] "TCGA-61-1738" "TCGA-61-1917" "TCGA-61-1919" "TCGA-61-1998" "TCGA-61-2012"
## [76] "TCGA-61-2102" "TCGA-61-2113"

Import Clinical Data and Cleanup

Next we will import our clinical data and clean it up by extracting only the data we need. To conduct our survival analysis, we will need to extract the patient ID, the patient status, and the patient survival time. Again, the data we are using is from cBioPortal, and the specific file is titled “data_clinical_patient.txt”, but I have done some manual cleaning by removing the first few rows from the file.

1 = deceased
0 = living

patient_data <- read.csv("/Users/richardfang/Computer Science/R-practice/CCNE1-project/data_clinical_patient_CLEANED.csv",
    fill = TRUE, header = TRUE)

patient_data$OS_STATUS <- as.numeric(str_trunc(patient_data$OS_STATUS,
    1, ellipsis = ""))
survival_data <- patient_data[, c("PATIENT_ID", "OS_STATUS",
    "OS_MONTHS")]

head(survival_data, 20)
##      PATIENT_ID OS_STATUS       OS_MONTHS
## 1  TCGA-04-1331         1           43.89
## 2  TCGA-04-1332         1           40.97
## 3  TCGA-04-1335         1            1.81
## 4  TCGA-04-1336         0           49.11
## 5  TCGA-04-1337         1               2
## 6  TCGA-04-1338         0           46.58
## 7  TCGA-04-1341         0            1.08
## 8  TCGA-04-1342         1            18.5
## 9  TCGA-04-1343         1           11.86
## 10 TCGA-04-1346         0           65.47
## 11 TCGA-04-1347         0           63.04
## 12 TCGA-04-1348         1           48.72
## 13 TCGA-04-1349         1           21.55
## 14 TCGA-04-1350         1           63.93
## 15 TCGA-04-1351         0           65.41
## 16 TCGA-04-1353         0            0.53
## 17 TCGA-04-1356         1           49.24
## 18 TCGA-04-1357         0 [Not Available]
## 19 TCGA-04-1360         0 [Not Available]
## 20 TCGA-04-1361         0           32.49

As we can see in the dataframe above, there are some patients without data available. Because this will not be useful, we need to clean this data further by looping through and removing any patient without data.

remove_rows <- c()
for (i in 1:nrow(survival_data)) {
    if (survival_data[i, "OS_MONTHS"] == "[Not Available]") {
        remove_rows <- c(remove_rows, i)
    }
}
survival_data <- survival_data[-remove_rows, ]

head(survival_data, 20)
##      PATIENT_ID OS_STATUS OS_MONTHS
## 1  TCGA-04-1331         1     43.89
## 2  TCGA-04-1332         1     40.97
## 3  TCGA-04-1335         1      1.81
## 4  TCGA-04-1336         0     49.11
## 5  TCGA-04-1337         1         2
## 6  TCGA-04-1338         0     46.58
## 7  TCGA-04-1341         0      1.08
## 8  TCGA-04-1342         1      18.5
## 9  TCGA-04-1343         1     11.86
## 10 TCGA-04-1346         0     65.47
## 11 TCGA-04-1347         0     63.04
## 12 TCGA-04-1348         1     48.72
## 13 TCGA-04-1349         1     21.55
## 14 TCGA-04-1350         1     63.93
## 15 TCGA-04-1351         0     65.41
## 16 TCGA-04-1353         0      0.53
## 17 TCGA-04-1356         1     49.24
## 20 TCGA-04-1361         0     32.49
## 21 TCGA-04-1362         1     44.28
## 22 TCGA-04-1364         1     33.64

Selecting Clinical Data

Now with our clinical data cleaned, we would like to select specific patients based on CCNE1-expression. Remember, earlier we created vectors with our patients of interest, so now we will use those vectors to select the clinical data of our patients.

control_survival <- survival_data[survival_data$PATIENT_ID %in%
    control_names, ]
ccne1_survival <- survival_data[survival_data$PATIENT_ID %in%
    ccne1_names, ]

length(control_names)
## [1] 77
length(ccne1_names)
## [1] 77

nrow(control_survival)
## [1] 76
nrow(ccne1_survival)
## [1] 77

Removing Outliers

With our patient clinical samples defined, we need to remove any outliers that may skew results. To do this, we will employ the formula \(Outlier > Mean + (3 * Standard Deviation)\) or \(Outlier < Mean - (3 * Standard Deviation)\).

ccne1_outliers <- c()
for (i in 1:nrow(ccne1_survival)) {
    if (as.numeric(ccne1_survival$OS_MONTHS[i]) > mean(as.numeric(ccne1_survival$OS_MONTHS) +
        (sd(as.numeric(ccne1_survival$OS_MONTHS)) * 3)) | as.numeric(ccne1_survival$OS_MONTHS[i]) <
        mean(as.numeric(ccne1_survival$OS_MONTHS)) - (sd(as.numeric(ccne1_survival$OS_MONTHS)) *
            3)) {
        ccne1_outliers <- c(ccne1_outliers, i)
    }
}
if (length(ccne1_outliers) != 0) {
    ccne1_survival <- ccne1_survival[-ccne1_outliers, ]
}

control_outliers <- c()
for (i in 1:nrow(control_survival)) {
    if (as.numeric(control_survival$OS_MONTHS[i]) > mean(as.numeric(control_survival$OS_MONTHS) +
        (sd(as.numeric(control_survival$OS_MONTHS)) * 3)) | as.numeric(control_survival$OS_MONTHS[i]) <
        mean(as.numeric(control_survival$OS_MONTHS)) - (sd(as.numeric(control_survival$OS_MONTHS)) *
            3)) {
        control_outliers <- c(control_outliers, i)
    }
}
if (length(control_outliers) != 0) {
    control_survival <- control_survival[-control_outliers, ]
}

nrow(control_survival)
## [1] 76
nrow(ccne1_survival)
## [1] 76

Plotting the Survival Curves

Now that our outliers are removed, we can begin to plot our survival curves. We will plot the data censoring based on status. Only patients who are deceased will be included on the plot.

control_survival$OS_MONTHS <- as.numeric(as.character(control_survival$OS_MONTHS))
control_survival$OS_STATUS <- as.numeric(as.character(control_survival$OS_STATUS))
ccne1_survival$OS_MONTHS <- as.numeric(as.character(ccne1_survival$OS_MONTHS))
ccne1_survival$OS_STATUS <- as.numeric(as.character(ccne1_survival$OS_STATUS))

sf.control <- survfit2(Surv(OS_MONTHS, OS_STATUS) ~ 1, data = control_survival)
sf.control %>%
    ggsurvfit() + add_confidence_interval() + add_risktable()

sf.control %>%
    tbl_survfit(probs = 0.5, label_header = "**Median survival (95% CI)**")
Characteristic Median survival (95% CI)
Overall 48 (35, 62)

sf.ccne1 <- survfit2(Surv(OS_MONTHS, OS_STATUS) ~ 1, data = ccne1_survival)
sf.ccne1 %>%
    ggsurvfit() + add_confidence_interval() + add_risktable()

sf.ccne1 %>%
    tbl_survfit(probs = 0.5, label_header = "**Median survival (95% CI)**")
Characteristic Median survival (95% CI)
Overall 38 (34, 45)

We have both of our survival curves, so now let’s plot on the same graph now.

all_surv <- rbind((cbind(control_survival, type = "control")),
    (cbind(ccne1_survival, type = "ccne1")))

sf.all <- survfit2(Surv(as.numeric(OS_MONTHS), OS_STATUS) ~ type,
    data = all_surv)
sf.all %>%
    ggsurvfit() + add_confidence_interval() + add_risktable()

sf.all %>%
    tbl_survfit(probs = 0.5, label_header = "**Median survival (95% CI)**")
Characteristic Median survival (95% CI)
type
    ccne1 38 (34, 45)
    control 48 (35, 62)
remove_control <- which(grepl("0", as.character(control_survival$OS_STATUS)))
control_survival <- control_survival[-remove_control, ]
remove_ccne1 <- which(grepl("0", as.character(ccne1_survival$OS_STATUS)))
ccne1_survival <- ccne1_survival[-remove_ccne1, ]

surv_no_living <- rbind((cbind(control_survival, type = "control")),
    (cbind(ccne1_survival, type = "ccne1")))

sf.all <- survfit2(Surv(as.numeric(OS_MONTHS), OS_STATUS) ~ type,
    data = surv_no_living)
sf.all %>%
    ggsurvfit() + add_confidence_interval() + add_risktable()

sf.all %>%
    tbl_survfit(probs = 0.5, label_header = "**Median survival (95% CI)**")
Characteristic Median survival (95% CI)
type
    ccne1 34 (21, 38)
    control 35 (27, 57)

As far as I understand, “censoring” for survival analysis accounts for the living patients, so excluding them in the analysis is incorrect

Violin Plots of Gene Expression in Control vs. CCNE1-high Patients

Now that we have our KM plots, let’s take a look at gene expression for specified genes between control and CCNE1-high patients. We have already created these dataframes earlier, so we will continue to work with them.

## [1] "CCNE1"
## [1] "Control mean: "
## [1] 220.5552
## [1] "CCNE1-high mean: "
## [1] 2626.943

## [1] "GPX4"
## [1] "Control mean: "
## [1] 3751.398
## [1] "CCNE1-high mean: "
## [1] 5260.06

## [1] "FTH1"
## [1] "Control mean: "
## [1] 55286.35
## [1] "CCNE1-high mean: "
## [1] 70301.04

## [1] "FTL"
## [1] "Control mean: "
## [1] 39652.52
## [1] "CCNE1-high mean: "
## [1] 58126.86

## [1] "TFRC"
## [1] "Control mean: "
## [1] 3350.357
## [1] "CCNE1-high mean: "
## [1] 4315.588

## [1] "SLC7A11"
## [1] "Control mean: "
## [1] 373.0885
## [1] "CCNE1-high mean: "
## [1] 338.5149

## [1] "SLC11A2"
## [1] "Control mean: "
## [1] 2261.683
## [1] "CCNE1-high mean: "
## [1] 1963.855