import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
import plotly.express as px

C:\Users\ASUS\AppData\Roaming\Python\Python39\site-packages\pandas\core\computation\expressions.py:21: UserWarning: Pandas requires version '2.8.4' or newer of 'numexpr' (version '2.8.3' currently installed).
  from pandas.core.computation.check import NUMEXPR_INSTALLED
C:\Users\ASUS\AppData\Roaming\Python\Python39\site-packages\pandas\core\arrays\masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
  from pandas.core import (

# Read the Iris measurements from the CSV file and discard the 'Id' column,
# which the analysis below does not use.
iris_data = pd.read_csv('Iris.csv').drop(columns=['Id'])

# Preview the leading rows to confirm the load looks right
print(iris_data.head())

   SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0            5.1           3.5            1.4           0.2  Iris-setosa
1            4.9           3.0            1.4           0.2  Iris-setosa
2            4.7           3.2            1.3           0.2  Iris-setosa
3            4.6           3.1            1.5           0.2  Iris-setosa
4            5.0           3.6            1.4           0.2  Iris-setosa

# Count the null entries in every column and print the per-column totals
missing_counts = iris_data.isnull().sum()
print(missing_counts)

SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

# Split the frame: the species labels go into their own Series, and every
# remaining column becomes the feature matrix.
species = iris_data['Species']
features = iris_data.drop(columns=['Species'])

# Fit a StandardScaler on the features and transform them in a single step
scaler = StandardScaler()
normalized_features = scaler.fit_transform(features)

# Wrap the scaled array in a DataFrame that reuses the original column names
normalized_features_df = pd.DataFrame(
    data=normalized_features,
    columns=features.columns,
)

# Preview the scaled values
print(normalized_features_df.head())

   SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
0      -0.900681      1.032057      -1.341272     -1.312977
1      -1.143017     -0.124958      -1.341272     -1.312977
2      -1.385353      0.337848      -1.398138     -1.312977
3      -1.506521      0.106445      -1.284407     -1.312977
4      -1.021849      1.263460      -1.341272     -1.312977

# Apply K-Means clustering with 3 clusters (since there are 3 species).
# n_init is passed explicitly as 10 -- the value the default currently
# resolves to -- so the fitted labels are unchanged while scikit-learn no
# longer emits its FutureWarning about the default switching to 'auto' in 1.4.
# random_state pins the centroid initialization so reruns give the same labels.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(normalized_features)

# Add the cluster labels to the original dataframe (one label per row, in
# the same row order as the data the model was fitted on)
iris_data['Cluster'] = kmeans.labels_

# Show the first few rows with the cluster labels
print(iris_data.head())

C:\Users\ASUS\AppData\Roaming\Python\Python39\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)

   SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species  \
0            5.1           3.5            1.4           0.2  Iris-setosa   
1            4.9           3.0            1.4           0.2  Iris-setosa   
2            4.7           3.2            1.3           0.2  Iris-setosa   
3            4.6           3.1            1.5           0.2  Iris-setosa   
4            5.0           3.6            1.4           0.2  Iris-setosa   

   Cluster  
0        1  
1        1  
2        1  
3        1  
4        1

# Pairwise feature plots colored by K-Means cluster, with KDE curves on the
# diagonal panels
sns.pairplot(data=iris_data, hue='Cluster', diag_kind='kde', palette='Set1')
plt.show()

# Standardized sepal length against standardized sepal width, colored by the
# cluster label of the matching row in iris_data
sepal_length = normalized_features_df['SepalLengthCm']
sepal_width = normalized_features_df['SepalWidthCm']
cluster_labels = iris_data['Cluster']

plt.figure(figsize=(8, 6))
sns.scatterplot(x=sepal_length, y=sepal_width, hue=cluster_labels, palette='Set1')
plt.title('2D Scatter Plot: Sepal Length vs Sepal Width')
plt.show()

# Correlation heatmap of the measurement columns.
# The matrix is computed from `features` (the numeric columns of the CSV
# loaded earlier in this file) so the heatmap describes the same rows that
# were scaled and clustered, instead of a second copy of the dataset fetched
# from sklearn.datasets. seaborn, matplotlib and pandas are already imported
# at the top of the file, so they are not imported again here.
# Pearson correlation is unaffected by per-column standardization, so the raw
# feature values are used directly.
correlation_matrix = features.corr()

# Draw the annotated heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", linewidths=0.5)
plt.title('Feature Correlation Heatmap')
plt.show()

# Static 3D scatter (Matplotlib) of three standardized measurements,
# with each point colored by its K-Means cluster label
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Pull out the three plotted axes and the per-row cluster labels
sepal_length = normalized_features_df['SepalLengthCm']
sepal_width = normalized_features_df['SepalWidthCm']
petal_length = normalized_features_df['PetalLengthCm']
cluster_labels = iris_data['Cluster']

ax.scatter(sepal_length, sepal_width, petal_length, c=cluster_labels, cmap='Set1')

# Axis labels and title
ax.set_xlabel('Sepal Length')
ax.set_ylabel('Sepal Width')
ax.set_zlabel('Petal Length')
ax.set_title('3D Scatter Plot')
plt.show()

# 3D Scatter Plot using Plotly for interactive visualization.
# The K-Means labels are integers, and Plotly Express maps a numeric `color`
# column to a continuous color scale with a colorbar. Cluster ids are
# categories, so the plot uses a string copy of the column, which yields one
# discrete color and one legend entry per cluster. `assign` returns a new
# frame, so iris_data itself keeps its integer 'Cluster' column.
plot_data = iris_data.assign(Cluster=iris_data['Cluster'].astype(str))

fig = px.scatter_3d(plot_data,
                    x='SepalLengthCm',
                    y='SepalWidthCm',
                    z='PetalLengthCm',
                    color='Cluster',
                    # Keep the legend in ascending cluster order
                    category_orders={'Cluster': sorted(plot_data['Cluster'].unique())},
                    labels={'Cluster': 'Cluster Group'},
                    title='3D Interactive Scatter Plot')
fig.show()

Iris Dataset Clustering and Visualization

Introduction

Key Features:

Libraries Used:

Author

Project Goals:

Dataset

Step 1: Import Libraries

Step 2: Load the Dataset

The Iris dataset consists of the following columns:

Project Overview

Data Preprocessing:

K-Means Clustering:

2D Visualizations:

3D Visualization:

Step 3: Data Preprocessing

1. Check for Missing Values:

2. Normalize the Feature Data:

Step 4: Apply K-Means Clustering

Step 5: 2D Visualizations

1. Pairplot using Seaborn:

2. Scatter Plot of Sepal Length vs Sepal Width:

3. Correlation Heatmap:

Key Insights:

Summary:

Step 6: 3D Visualizations

1. 3D Scatter Plot using Matplotlib:

2. Interactive 3D Plot using Plotly:

Conclusion