Titanic Data Analysis & Visualization¶
Introduction¶
Welcome to the Titanic Data Analysis project! This project showcases advanced data analysis techniques, interactive visualizations, and predictive modeling using the famous Titanic dataset. The analysis focuses on understanding the factors that influenced the survival of passengers aboard the Titanic.
About the Author¶
Name: Durvesh Sunil Baharwal
LinkedIn: LinkedIn Profile
GitHub: GitHub Profile
I am a passionate data scientist with a deep interest in data analysis, machine learning, and creating impactful visualizations. This project is a demonstration of my skills in data manipulation, feature engineering, and model building, using Python and its powerful libraries.
Project Overview¶
In this project, I explore the Titanic dataset through the following steps:
- Data Preprocessing and Feature Engineering
- Exploratory Data Analysis (EDA)
- Building Predictive Models
- Interactive Visualizations
- Survival Analysis
I hope you find this project informative and insightful. Feel free to connect with me on LinkedIn or explore my other projects on GitHub. If you have any suggestions or questions, don't hesitate to reach out!
Thank you for visiting, and I hope you enjoy exploring this project!
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the Titanic dataset from seaborn
data = sns.load_dataset('titanic')
# Feature Engineering: Create a new feature - Fare per Person
data['fare_per_person'] = data['fare'] / (data['sibsp'] + data['parch'] + 1)
# Create a new feature - IsAlone
data['is_alone'] = 0
data.loc[(data['sibsp'] == 0) & (data['parch'] == 0), 'is_alone'] = 1
# Display the first few rows to confirm the new features
print(data.head())
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third
1         1       1  female  38.0      1      0  71.2833        C  First
2         1       3  female  26.0      0      0   7.9250        S  Third
3         1       1  female  35.0      1      0  53.1000        S  First
4         0       3    male  35.0      0      0   8.0500        S  Third

     who  adult_male deck  embark_town alive  alone  fare_per_person  is_alone
0    man        True  NaN  Southampton    no  False          3.62500         0
1  woman       False    C    Cherbourg   yes  False         35.64165         0
2  woman       False  NaN  Southampton   yes   True          7.92500         1
3  woman       False    C  Southampton   yes  False         26.55000         0
4    man        True  NaN  Southampton    no   True          8.05000         1
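As a quick sanity check (using nothing beyond the cell above), we can summarize the two engineered columns to confirm they look sensible:
# Descriptive statistics for the engineered features
print(data[['fare_per_person', 'is_alone']].describe())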
Correlation Heatmap¶
This heatmap helps identify linear relationships among the numerical features, including the new features we just created.
# Select numerical features to include in the correlation heatmap
numerical_features = ['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'fare_per_person', 'is_alone']
# Compute the correlation matrix
corr_matrix = data[numerical_features].corr()
# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap of Titanic Dataset Features')
plt.show()
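To make the heatmap easier to read, it also helps to rank each feature's correlation with the target directly. A minimal sketch, reusing the corr_matrix computed above:
# Rank features by the absolute strength of their correlation with survival
survival_corr = corr_matrix['survived'].drop('survived')
print(survival_corr.sort_values(key=abs, ascending=False))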
Principal Component Analysis (PCA)¶
PCA is a dimensionality reduction technique that projects high-dimensional data into 2D or 3D for visualization while retaining as much of the variance as possible. It is particularly useful when a dataset has many features.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Select the features for PCA (excluding the target variable 'survived')
features = ['pclass', 'age', 'sibsp', 'parch', 'fare', 'fare_per_person', 'is_alone']
X = data[features].dropna() # Drop missing values for simplicity
y = data.loc[X.index, 'survived']
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Apply PCA to reduce the dimensionality
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
# Plot the PCA result
plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y, palette='coolwarm')
plt.title('PCA of Titanic Dataset')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.show()
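Since the point of PCA here is to retain most of the variance, it's worth quantifying how much actually survives the 2D projection. A quick check on the pca object fitted above:
# Fraction of the total variance captured by each principal component
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total variance retained in 2D:", pca.explained_variance_ratio_.sum())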
Advanced Model Building¶
We'll train a Random Forest model to predict survival and then analyze its feature importances to see which features contribute most to the prediction.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
# Prepare the data for model building (ensure no missing values)
X = data[features].fillna(0)
y = data['survived'].fillna(0)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Build a Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
# Make predictions
y_pred = rf_model.predict(X_test)
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
# Feature importance analysis
importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1]
# Plot the feature importances
plt.figure(figsize=(10, 6))
sns.barplot(x=importances[indices], y=np.array(features)[indices], palette='viridis')
plt.title('Feature Importance - Random Forest')
plt.show()
Accuracy: 0.6716417910447762

Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.76      0.73       157
           1       0.62      0.55      0.58       111

    accuracy                           0.67       268
   macro avg       0.66      0.65      0.66       268
weighted avg       0.67      0.67      0.67       268
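A single 70/30 split can be noisy, so the ~0.67 accuracy above is only one draw. As a sanity check, here is a sketch of 5-fold cross-validation on the same features and model settings (reusing the X and y defined above):
from sklearn.model_selection import cross_val_score

# Estimate accuracy across 5 folds to gauge the stability of the single-split score
cv_scores = cross_val_score(RandomForestClassifier(n_estimators=100, random_state=42),
                            X, y, cv=5, scoring='accuracy')
print("CV accuracy: %.3f +/- %.3f" % (cv_scores.mean(), cv_scores.std()))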
Interactive Visualizations¶
We'll use Plotly to create interactive visualizations, which let us explore data patterns dynamically through hovering, zooming, and panning.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.express as px
# Re-select features (to avoid using previous X with missing data)
features = ['pclass', 'age', 'sibsp', 'parch', 'fare', 'fare_per_person', 'is_alone']
# Drop rows with missing values
data_clean = data.dropna(subset=features + ['survived'])
# Standardize the features
X_scaled = StandardScaler().fit_transform(data_clean[features])
# Apply PCA to reduce the dimensionality
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
# Create a DataFrame for the PCA results
pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
# Align the 'Survived' values with the PCA DataFrame
pca_df['Survived'] = data_clean['survived'].values
# Plotly Scatter Plot
fig = px.scatter(pca_df, x='PC1', y='PC2', color='Survived',
title='Interactive PCA of Titanic Dataset',
labels={'PC1': 'First Principal Component', 'PC2': 'Second Principal Component'})
fig.show()
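To interpret what PC1 and PC2 represent, we can inspect the component loadings, i.e. how strongly each original feature contributes to each component. A minimal sketch using the pca object fitted in the cell above:
# pca.components_ has one row per component; transpose so each row is a feature
loadings = pd.DataFrame(pca.components_.T, index=features, columns=['PC1', 'PC2'])
print(loadings.round(3))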
Interactive Feature Importance Bar Chart¶
# Create a DataFrame for feature importance
importance_df = pd.DataFrame({
'Feature': np.array(features)[indices],
'Importance': importances[indices]
})
# Plotly Bar Chart
fig = px.bar(importance_df, x='Importance', y='Feature', orientation='h',
title='Interactive Feature Importance - Random Forest',
labels={'Importance': 'Importance', 'Feature': 'Feature'})
fig.show()
Kaplan-Meier Survival Curves¶
We'll use lifelines, a survival analysis library, to generate Kaplan-Meier curves. Here we treat each passenger's age as the duration and the survived flag as the observed event, then compare the resulting curves across age groups.
from lifelines import KaplanMeierFitter
# Ensure 'age' is numeric and drop rows with missing values in 'age' and 'survived'
data_clean = data.dropna(subset=['age', 'survived']).copy()
data_clean.loc[:, 'age'] = pd.to_numeric(data_clean['age'], errors='coerce')
# Recreate age groups based on the clean data
data_clean.loc[:, 'age_group'] = pd.cut(data_clean['age'], bins=[0, 30, 80], labels=['0-30', '30+'])
# Initialize the KaplanMeierFitter
kmf = KaplanMeierFitter()
# Plot the Kaplan-Meier curves for each age group
plt.figure(figsize=(10, 6))
for age_group in data_clean['age_group'].unique():
mask = data_clean['age_group'] == age_group
if not mask.any():
continue
kmf.fit(durations=data_clean.loc[mask, 'age'],
event_observed=data_clean.loc[mask, 'survived'],
label=str(age_group))
kmf.plot_survival_function()
plt.title('Kaplan-Meier Survival Curve - Age Groups')
plt.xlabel('Age')
plt.ylabel('Survival Probability')
plt.show()
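The same machinery supports formal comparisons: lifelines provides a log-rank test for whether two groups' curves differ significantly. As an illustration, under the same age-as-duration, survived-as-event convention used above, here is a sketch comparing male and female passengers:
from lifelines.statistics import logrank_test

# Compare the survival curves of male vs. female passengers
male = data_clean['sex'] == 'male'
female = data_clean['sex'] == 'female'
result = logrank_test(data_clean.loc[male, 'age'], data_clean.loc[female, 'age'],
                      event_observed_A=data_clean.loc[male, 'survived'],
                      event_observed_B=data_clean.loc[female, 'survived'])
print("Log-rank test p-value:", result.p_value)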