AI/ML Algorithms - 2
6. Random Forests (Supervised):
Imagine you're trying to decide whether to see a new movie. You could ask one friend for their opinion, but their taste might be very different from yours. A better approach would be to ask many friends with diverse tastes, and then make your decision based on a consensus of their opinions. That's the core idea behind Random Forests.
In more technical terms, a Random Forest is an ensemble learning method that combines multiple decision trees to make a final prediction. Instead of relying on a single decision tree, which can be prone to overfitting (performing well on training data but poorly on new, unseen data), a Random Forest aggregates the predictions of many individual trees, making it a more robust and accurate model.
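A quick way to see the "ask many friends" idea in code is to train several shallow decision trees on bootstrap samples of the same data and let them vote. The sketch below only illustrates that bagging intuition on a toy dataset (the dataset and tree settings are assumptions, not part of the example that follows); a real Random Forest additionally considers a random subset of features at each split, which scikit-learn's RandomForestClassifier handles for us.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

# Toy data purely for illustration.
X, y = make_classification(n_samples=500, n_features=5, random_state=42)
rng = np.random.default_rng(42)

trees = []
for _ in range(25):
    # Bootstrap sample: draw rows with replacement so each tree sees a slightly different view.
    idx = rng.integers(0, len(X), len(X))
    trees.append(DecisionTreeClassifier(max_depth=3, random_state=0).fit(X[idx], y[idx]))

# Majority vote across the 25 trees.
votes = np.mean([t.predict(X) for t in trees], axis=0)
ensemble_pred = (votes > 0.5).astype(int)
print("Ensemble training accuracy:", (ensemble_pred == y).mean())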
In our example, we are going to predict whether it will rain based on several features such as temperature, humidity, wind speed, cloud cover, and pressure. The script also writes the synthetic data to a CSV file so you can inspect it.
Here is the implementation:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
def generate_weather_data(n_samples=1000):
"""Generate synthetic weather data for rain prediction"""
np.random.seed(42)
# Generate features with realistic ranges
temperature = np.random.normal(25, 5, n_samples) # Celsius
humidity = np.random.normal(65, 15, n_samples) # Percentage
wind_speed = np.random.normal(15, 5, n_samples) # km/h
cloud_cover = np.random.normal(50, 20, n_samples) # Percentage
pressure = np.random.normal(1013, 10, n_samples) # hPa
# Create realistic rain conditions based on weather features
rain = np.zeros(n_samples)
for i in range(n_samples):
# Higher chance of rain with:
# - High humidity (>70%)
# - High cloud cover (>60%)
# - Low pressure (<1010 hPa)
# - Moderate wind speed (10-20 km/h)
rain_prob = 0
if humidity[i] > 70:
rain_prob += 0.3
if cloud_cover[i] > 60:
rain_prob += 0.3
if pressure[i] < 1010:
rain_prob += 0.2
if 10 <= wind_speed[i] <= 20:
rain_prob += 0.2
# Temperature effect (less likely to rain in very hot or cold conditions)
if temperature[i] > 30 or temperature[i] < 10:
rain_prob *= 0.5
# Add some randomness
rain_prob += np.random.normal(0, 0.1)
# Convert probability to binary outcome
rain[i] = 1 if rain_prob > 0.5 else 0
# Create DataFrame
data = pd.DataFrame({
'temperature': temperature,
'humidity': humidity,
'wind_speed': wind_speed,
'cloud_cover': cloud_cover,
'pressure': pressure,
'rain': rain
})
return data
def display_dataset_info(data):
"""Display information about the dataset"""
print("\nDataset Information:")
print(f"Number of samples: {len(data)}")
print(f"Number of features: {len(data.columns) - 1}") # Excluding target
print(f"Rain distribution:\n{data['rain'].value_counts()}")
print("\nFirst 10 rows of the dataset:")
print(data.head(10))
print("\nDataset Statistics:")
print(data.describe())
# Save to CSV
data.to_csv('weather_dataset.csv', index=False)
print("\nDataset saved to 'weather_dataset.csv'")
def plot_feature_distributions(data):
"""Plot distributions of features by rain status"""
plt.figure(figsize=(15, 12))
features = ['temperature', 'humidity', 'wind_speed', 'cloud_cover', 'pressure']
for i, feature in enumerate(features):
plt.subplot(3, 2, i+1)
sns.histplot(data=data, x=feature, hue='rain', kde=True)
plt.title(f'Distribution of {feature.replace("_", " ").title()}')
plt.tight_layout()
plt.savefig('weather_distributions.png')
plt.close()
def plot_feature_importance(model, feature_names):
"""Plot feature importance"""
plt.figure(figsize=(10, 6))
importance = pd.Series(model.feature_importances_, index=feature_names)
importance.sort_values().plot(kind='barh')
plt.title('Feature Importance for Rain Prediction')
plt.xlabel('Importance Score')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()
def main():
# Generate weather data
print("Generating weather data...")
data = generate_weather_data(n_samples=1000)
# Display dataset information and save to CSV
display_dataset_info(data)
# Split data into features and target
X = data[['temperature', 'humidity', 'wind_speed', 'cloud_cover', 'pressure']].values
y = data['rain'].values
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Create and train Random Forest
print("\nTraining Random Forest...")
rf = RandomForestClassifier(
n_estimators=100,
max_depth=5,
random_state=42
)
rf.fit(X_train, y_train)
# Make predictions
y_pred = rf.predict(X_test)
# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_test, y_pred),
annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix for Rain Prediction')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('confusion_matrix.png')
plt.close()
# Generate visualizations
print("\nGenerating visualizations...")
plot_feature_distributions(data)
plot_feature_importance(rf, ['Temperature', 'Humidity', 'Wind Speed', 'Cloud Cover', 'Pressure'])
print("\nVisualizations saved as:")
print("- weather_distributions.png")
print("- feature_importance.png")
print("- confusion_matrix.png")
if __name__ == "__main__":
main()
Outputs:
random_forests % python random_forest_example.py
Generating weather data...
Dataset Information:
Number of samples: 1000
Number of features: 5
Rain distribution:
rain
0.0 667
1.0 333
Name: count, dtype: int64
First 10 rows of the dataset:
temperature humidity wind_speed cloud_cover pressure rain
0 27.483571 85.990332 11.624109 11.843849 1004.365064 1.0
1 24.308678 78.869505 14.277407 32.792300 1012.687965 0.0
2 28.238443 65.894456 11.037900 41.727889 1013.180169 0.0
3 32.615149 55.295948 13.460192 87.753753 1017.726303 0.0
4 23.829233 75.473350 5.531927 61.131062 999.331416 1.0
5 23.829315 70.902281 16.066469 23.290369 1018.925673 0.0
6 32.896064 78.427898 15.006027 59.720726 985.956084 0.0
7 28.837174 74.527577 10.914557 19.053920 1006.701154 1.0
8 22.652628 80.743291 18.296228 71.653821 1008.117262 1.0
9 27.712800 56.971472 19.687851 40.577507 1019.333268 0.0
Dataset Statistics:
temperature humidity wind_speed cloud_cover pressure rain
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 25.096660 66.062544 15.029171 49.625616 1012.507264 0.333000
std 4.896080 14.961816 4.917271 20.542651 9.923802 0.471522
min 8.793663 20.894170 -0.097561 -8.588974 981.232962 0.000000
25% 21.762048 55.906375 11.760002 35.251591 1006.173950 0.000000
50% 25.126503 65.946157 14.998746 50.003691 1012.817580 0.000000
75% 28.239719 75.933233 18.304577 63.338908 1019.391231 1.000000
max 44.263657 112.896614 34.631189 114.861859 1044.129102 1.000000
Dataset saved to 'weather_dataset.csv'
Training Random Forest...
Model Accuracy: 0.8800
Classification Report:
precision recall f1-score support
0.0 0.90 0.94 0.92 140
1.0 0.83 0.75 0.79 60
accuracy 0.88 200
macro avg 0.87 0.84 0.85 200
weighted avg 0.88 0.88 0.88 200
Generating visualizations...
Visualizations saved as:
- weather_distributions.png
- feature_importance.png
- confusion_matrix.png
The dataset is saved as 'weather_dataset.csv' and includes five features plus the target:
- Temperature (in Celsius)
- Humidity (in percentage)
- Wind Speed (in km/h)
- Cloud Cover (in percentage)
- Pressure (in hPa)
The target variable is 'rain' (0 or 1), determined by the rules used to generate the data: high humidity (>70%), heavy cloud cover (>60%), low pressure (<1010 hPa), and moderate wind speed (10-20 km/h) each raise the chance of rain, while very hot (>30 C) or very cold (<10 C) conditions halve it.
The visualizations show the distribution of each feature split by rain status ('weather_distributions.png'), the feature importance scores ('feature_importance.png'), and the confusion matrix ('confusion_matrix.png').
Confusion Matrix: the raw counts are not printed above, but they are saved to 'confusion_matrix.png' and can be inferred from the classification report.
The Classification Report shows:
Class 0 (No Rain):
- Precision: 0.90 (90% of predicted "no rain" cases were correct)
- Recall: 0.94 (94% of actual "no rain" cases were correctly identified)
- F1-score: 0.92 (harmonic mean of precision and recall)
Class 1 (Rain):
- Precision: 0.83 (83% of predicted "rain" cases were correct)
- Recall: 0.75 (75% of actual "rain" cases were correctly identified)
- F1-score: 0.79 (harmonic mean of precision and recall)
Overall Performance:
- Accuracy: 0.88 on the 200 test samples
- Macro-average F1: 0.85; weighted-average F1: 0.88
Interpretation: the model identifies "no rain" days more reliably than "rain" days, which is expected given the class imbalance in the generated data (667 no-rain vs. 333 rain samples).
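To make the per-class numbers concrete, precision, recall, and F1 can be recovered from raw counts. The confusion-matrix cells are not printed above, so the counts below are only illustrative values chosen to be roughly consistent with the report:

# Illustrative counts for the "rain" class (class 1); these are assumptions,
# not the actual confusion-matrix cells from the run above.
tp, fp, fn = 45, 9, 15                               # true positives, false positives, false negatives

precision = tp / (tp + fp)                           # ~0.83: predicted "rain" that was correct
recall = tp / (tp + fn)                              # 0.75: actual "rain" that was caught
f1 = 2 * precision * recall / (precision + recall)   # ~0.79
print(f"precision={precision:.2f}, recall={recall:.2f}, f1={f1:.2f}")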
7. K-Means Clustering (Unsupervised):
Being an unsupervised algorithm, the model receives the dataset without any labels for the customer segments. It has to discover them itself by grouping customers whose feature values are close together: k-means repeatedly assigns each customer to the nearest cluster centre and then moves each centre to the mean of its members.
In this example, we have customer spending data across different locations. We want to find patterns in their spending habits so that the right products can be promoted to the right groups.
Please check the outputs below, where the algorithm finds several interesting patterns that a marketing team could use to fine-tune their approach.
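Before the full example, here is a minimal sketch of the loop k-means runs under the hood: assign every point to its nearest centroid, move each centroid to the mean of its assigned points, and repeat. The toy 2-D data here is an assumption purely for illustration:

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(300, 2))                         # toy 2-D data, illustration only
k = 3
centroids = X[rng.choice(len(X), k, replace=False)]   # random initial centroids

for _ in range(10):
    # Assignment step: label each point with its closest centroid.
    dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
    labels = dists.argmin(axis=1)
    # Update step: move each centroid to the mean of its assigned points.
    centroids = np.array([X[labels == j].mean(axis=0) if np.any(labels == j) else centroids[j]
                          for j in range(k)])

print(centroids)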
Here is the implementation:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
def generate_customer_data(n_samples=1000):
"""Generate synthetic customer data with realistic marketing features"""
np.random.seed(42)
# Generate features with realistic distributions
# High spenders (20% of customers)
high_spenders = int(n_samples * 0.2)
total_spent_high = np.random.normal(7500, 1000, high_spenders)
frequency_high = np.random.normal(10, 1, high_spenders)
# Medium spenders (50% of customers)
medium_spenders = int(n_samples * 0.5)
total_spent_medium = np.random.normal(3500, 500, medium_spenders)
frequency_medium = np.random.normal(5, 1, medium_spenders)
# Low spenders (30% of customers)
low_spenders = n_samples - high_spenders - medium_spenders
total_spent_low = np.random.normal(1500, 300, low_spenders)
frequency_low = np.random.normal(2, 0.5, low_spenders)
# Combine spending data
total_spent = np.concatenate([total_spent_high, total_spent_medium, total_spent_low])
frequency = np.concatenate([frequency_high, frequency_medium, frequency_low])
# Calculate average transaction
avg_transaction = total_spent / (frequency * 12)
# Generate product preferences
# Electronics preference (higher for younger customers)
age = np.random.normal(35, 15, n_samples)
electronics_ratio = np.clip(0.1 + (35 - age) / 100 + np.random.normal(0, 0.1, n_samples), 0, 1)
# Clothing preference (relatively stable)
clothing_ratio = np.random.normal(0.3, 0.1, n_samples)
# Groceries (remaining ratio)
groceries_ratio = 1 - electronics_ratio - clothing_ratio
# Location (urban vs suburban)
# Higher probability of urban for higher spenders
urban_prob = np.clip(total_spent / 10000, 0.1, 0.9)
location = np.random.binomial(1, urban_prob)
# Create DataFrame
df = pd.DataFrame({
'total_spent': total_spent,
'avg_transaction': avg_transaction,
'frequency': frequency,
'electronics_ratio': electronics_ratio,
'clothing_ratio': clothing_ratio,
'groceries_ratio': groceries_ratio,
'age': age,
'location': location
})
# Ensure ratios sum to 1
ratio_cols = ['electronics_ratio', 'clothing_ratio', 'groceries_ratio']
df[ratio_cols] = df[ratio_cols].div(df[ratio_cols].sum(axis=1), axis=0)
return df
def plot_customer_clusters(data, labels, title):
"""Plot customer clusters using key features"""
plt.figure(figsize=(15, 10))
# Plot 1: Spending vs Frequency
plt.subplot(2, 2, 1)
sns.scatterplot(
x='total_spent',
y='frequency',
hue=labels,
data=data,
palette='viridis',
alpha=0.7
)
plt.title('Total Spending vs Purchase Frequency')
plt.xlabel('Total Amount Spent ($)')
plt.ylabel('Monthly Purchase Frequency')
# Plot 2: Age vs Electronics Ratio
plt.subplot(2, 2, 2)
sns.scatterplot(
x='age',
y='electronics_ratio',
hue=labels,
data=data,
palette='viridis',
alpha=0.7
)
plt.title('Age vs Electronics Spending Ratio')
plt.xlabel('Age')
plt.ylabel('Electronics Spending Ratio')
# Plot 3: Location vs Average Transaction
plt.subplot(2, 2, 3)
sns.boxplot(
x='location',
y='avg_transaction',
hue=labels,
data=data,
palette='viridis'
)
plt.title('Location vs Average Transaction Value')
plt.xlabel('Location (0: Suburban, 1: Urban)')
plt.ylabel('Average Transaction Value ($)')
plt.tight_layout()
plt.savefig(f'{title.lower().replace(" ", "_")}.png')
plt.close()
def analyze_clusters(data, labels):
"""Analyze and interpret discovered customer segments"""
print("\nDiscovered Customer Segments Analysis:")
for cluster_id in np.unique(labels):
cluster_data = data[labels == cluster_id]
# Calculate segment characteristics
avg_spent = cluster_data['total_spent'].mean()
avg_freq = cluster_data['frequency'].mean()
avg_age = cluster_data['age'].mean()
urban_ratio = cluster_data['location'].mean()
electronics_ratio = cluster_data['electronics_ratio'].mean()
# Determine segment type based on characteristics
if avg_spent > 6000 and avg_freq > 8:
segment_type = "High-Value Frequent Shoppers"
elif avg_spent < 2000 and avg_freq < 3:
segment_type = "Budget-Conscious Infrequent Shoppers"
elif avg_age < 30 and electronics_ratio > 0.5:
segment_type = "Young Tech Enthusiasts"
elif urban_ratio > 0.7 and avg_spent > 4000:
segment_type = "Urban Premium Shoppers"
else:
segment_type = "General Shoppers"
print(f"\nSegment {cluster_id + 1} ({segment_type}):")
print(f"Number of customers: {len(cluster_data)}")
print(f"Average total spent: ${avg_spent:.2f}")
print(f"Average monthly frequency: {avg_freq:.2f}")
print(f"Average age: {avg_age:.1f}")
print(f"Urban customers: {urban_ratio*100:.1f}%")
print("Average spending ratios:")
print(f" Electronics: {electronics_ratio*100:.1f}%")
print(f" Clothing: {cluster_data['clothing_ratio'].mean()*100:.1f}%")
print(f" Groceries: {cluster_data['groceries_ratio'].mean()*100:.1f}%")
# Marketing recommendations
print("\nMarketing Recommendations:")
if "High-Value" in segment_type:
print("- Premium loyalty program")
print("- Early access to new products")
print("- Personalized shopping experiences")
elif "Budget-Conscious" in segment_type:
print("- Discount and sale notifications")
print("- Budget-friendly product recommendations")
print("- Value bundles and packages")
elif "Tech Enthusiasts" in segment_type:
print("- Latest tech product announcements")
print("- Tech accessories and upgrades")
print("- Gaming and entertainment offers")
elif "Urban Premium" in segment_type:
print("- Premium urban lifestyle products")
print("- Convenience-focused services")
print("- Local store events and experiences")
else:
print("- General promotions")
print("- Seasonal offers")
print("- Cross-category recommendations")
def main():
# Generate customer data
print("Generating customer data...")
data = generate_customer_data(n_samples=1000)
# Save data to CSV
data.to_csv('customer_data.csv', index=False)
print("Data saved to 'customer_data.csv'")
# Display data information
print("\nDataset Information:")
print(f"Number of customers: {len(data)}")
print(f"Number of features: {len(data.columns)}")
print("\nFirst 5 rows:")
print(data.head())
# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data)
# Perform k-means clustering
print("\nPerforming k-means clustering...")
kmeans = KMeans(n_clusters=4, random_state=42) # Let's try 4 clusters
kmeans.fit(X_scaled)
# Add cluster labels to data
data['discovered_segment'] = kmeans.labels_
# Plot customer clusters
plot_customer_clusters(data, kmeans.labels_, "Discovered Customer Segments")
# Analyze and interpret clusters
analyze_clusters(data, kmeans.labels_)
# Evaluate clustering
silhouette_avg = silhouette_score(X_scaled, kmeans.labels_)
print(f"\nClustering Evaluation:")
print(f"Silhouette Score: {silhouette_avg:.4f}")
print(f"Inertia: {kmeans.inertia_:.2f}")
# Save results
data.to_csv('clustering_results.csv', index=False)
print("\nResults saved to 'clustering_results.csv'")
if __name__ == "__main__":
main()
Outputs:
k-means_clustering % python kmeans_example.py
Generating customer data...
Data saved to 'customer_data.csv'
Dataset Information:
Number of customers: 1000
Number of features: 8
First 5 rows:
total_spent avg_transaction frequency electronics_ratio clothing_ratio groceries_ratio age location
0 7996.714153 64.337375 10.357787 0.010496 0.213651 0.775853 24.872326 1
1 7361.735699 58.090190 10.560785 0.035639 0.296880 0.667481 32.832220 1
2 8147.688538 61.262375 11.083051 0.177502 0.301802 0.520696 23.113701 1
3 9023.029856 68.023577 11.053802 0.334963 0.347263 0.317774 30.380577 1
4 7265.846625 70.223150 8.622331 0.439698 0.163314 0.396988 6.595780 1
Performing k-means clustering...
Discovered Customer Segments Analysis:
Segment 1 (General Shoppers):
Number of customers: 78
Average total spent: $2419.23
Average monthly frequency: 1.93
Average age: 36.9
Urban customers: 20.5%
Average spending ratios:
Electronics: 9.6%
Clothing: 31.3%
Groceries: 59.1%
Marketing Recommendations:
- General promotions
- Seasonal offers
- Cross-category recommendations
Segment 2 (General Shoppers):
Number of customers: 449
Average total spent: $2759.87
Average monthly frequency: 4.12
Average age: 42.9
Urban customers: 26.5%
Average spending ratios:
Electronics: 5.0%
Clothing: 27.5%
Groceries: 67.5%
Marketing Recommendations:
- General promotions
- Seasonal offers
- Cross-category recommendations
Segment 3 (General Shoppers):
Number of customers: 275
Average total spent: $2942.26
Average monthly frequency: 4.25
Average age: 22.2
Urban customers: 28.0%
Average spending ratios:
Electronics: 28.2%
Clothing: 32.4%
Groceries: 39.4%
Marketing Recommendations:
- General promotions
- Seasonal offers
- Cross-category recommendations
Segment 4 (High-Value Frequent Shoppers):
Number of customers: 198
Average total spent: $7416.73
Average monthly frequency: 10.01
Average age: 34.6
Urban customers: 75.8%
Average spending ratios:
Electronics: 11.5%
Clothing: 29.4%
Groceries: 59.1%
Marketing Recommendations:
- Premium loyalty program
- Early access to new products
- Personalized shopping experiences
Clustering Evaluation:
Silhouette Score: 0.2441
Inertia: 4453.56
1. Discovered Customer Segments: the algorithm identified 4 natural customer segments based on their characteristics (spending level, purchase frequency, age, product mix, and location).
2. Clustering Quality: the silhouette score of 0.2441 indicates the segments overlap somewhat rather than being sharply separated, which is typical for behavioural data.
3. Marketing Implications and 4. Business Value: each segment gets its own set of recommendations, detailed in the analysis below.
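To see what each discovered segment looks like in the original units (dollars, visits per month, age, and so on), the cluster centres can be mapped back through the scaler. A short sketch, assuming the scaler, kmeans, and data objects from the example above:

import pandas as pd

# The eight features that were passed to the scaler (before 'discovered_segment' was added).
feature_cols = ['total_spent', 'avg_transaction', 'frequency', 'electronics_ratio',
                'clothing_ratio', 'groceries_ratio', 'age', 'location']
# Undo the standardization so the centroids are readable in original units.
centers = scaler.inverse_transform(kmeans.cluster_centers_)
print(pd.DataFrame(centers, columns=feature_cols).round(2))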
More analysis of the outputs:
2. Discovered Segments:
a) Segment 4 - High-Value Frequent Shoppers (198 customers)
b) Segment 3 - Young Tech-Oriented Shoppers (275 customers)
c) Segment 2 - Mainstream Shoppers (449 customers)
d) Segment 1 - Budget Shoppers (78 customers)
3. Clustering Quality Metrics: a silhouette score of 0.2441 and an inertia of 4453.56; the segments overlap rather than being sharply separated, but they are still distinct enough to act on.
4. Business Implications:
a) Resource Allocation:
b) Marketing Strategies:
c) Product Development:
PITFALLS: k-means requires choosing the number of clusters up front, is sensitive to feature scaling and outliers, depends on the random initialization, and assumes roughly spherical, similarly sized clusters, so it can miss segments with irregular shapes.
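One practical way to address the "choosing k" pitfall is to run k-means for several values of k and compare the silhouette score and inertia for each. A hedged sketch, assuming the X_scaled matrix from the example above:

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Compare a range of cluster counts on the already-scaled customer features.
for k in range(2, 8):
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    score = silhouette_score(X_scaled, km.labels_)
    print(f"k={k}: silhouette={score:.3f}, inertia={km.inertia_:.0f}")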
8. K-Nearest Neighbors (KNN) (Supervised):
How it works:
1. Store the entire training dataset; there is no separate training phase.
2. For a new sample, compute the distance (for example, Euclidean distance) to every training point.
3. Select the k training points closest to the new sample.
4. For classification, take a majority vote among those k neighbors (for regression, average their values).
5. Parameter "k": the value of "k" is a hyperparameter that needs to be chosen appropriately, and its impact on the model's performance can be significant.
This example classifies apples and oranges based on their weight and texture. When you run it, it saves the dataset to a fruit_dataset.csv file for reference.
Once the plot is displayed, it prints the predicted and actual labels for a few randomly selected fruits from the test set.
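One caveat worth noting: KNN is distance-based, so a feature measured in grams (roughly 100-250) will dominate a feature measured on a 1-10 texture scale when computing Euclidean distance. The example works anyway because the classes are well separated, but standardizing the features first is usually safer. A minimal sketch, assuming the KNN class and the X_train/X_test split from the implementation below:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)   # fit the scaler on training data only
X_test_scaled = scaler.transform(X_test)         # apply the same scaling to the test data

knn = KNN(k=3)                                   # the from-scratch KNN class defined below
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)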
Here is the implementation:
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
class KNN:
def __init__(self, k=3):
self.k = k
def fit(self, X, y):
"""Store the training data"""
self.X_train = X
self.y_train = y
def euclidean_distance(self, x1, x2):
"""Calculate Euclidean distance between two points"""
return np.sqrt(np.sum((x1 - x2) ** 2))
def get_neighbors(self, x):
"""Find k nearest neighbors of x"""
distances = []
for i in range(len(self.X_train)):
dist = self.euclidean_distance(x, self.X_train[i])
distances.append((dist, self.y_train[i]))
# Sort distances and get k nearest neighbors
distances.sort(key=lambda x: x[0])
return distances[:self.k]
def predict_classification(self, x):
"""Predict class for a single sample"""
neighbors = self.get_neighbors(x)
# Get the most common class among neighbors
classes = [neighbor[1] for neighbor in neighbors]
return Counter(classes).most_common(1)[0][0]
def predict(self, X):
"""Predict for multiple samples"""
return np.array([self.predict_classification(x) for x in X])
def plot_decision_boundary(X, y, model, title):
"""Plot decision boundary for 2D data"""
# Create a mesh grid with fewer points
h = 0.5 # Increased step size to reduce number of points
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Make predictions for the mesh points
mesh_points = np.c_[xx.ravel(), yy.ravel()]
# Process mesh points in batches to avoid memory issues
batch_size = 1000
Z = np.zeros(len(mesh_points))
for i in range(0, len(mesh_points), batch_size):
batch = mesh_points[i:i + batch_size]
Z[i:i + batch_size] = model.predict(batch)
Z = Z.reshape(xx.shape)
# Create the plot
plt.figure(figsize=(10, 8))
# Plot the decision boundary
plt.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
# Plot the training points
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k', marker='o', cmap='RdYlBu')
# Add labels and title
plt.title(title)
plt.xlabel('Weight (grams)')
plt.ylabel('Texture (1-10)')
# Add a colorbar
plt.colorbar(label='Fruit Type (0=Apple, 1=Orange)')
# Show the plot
plt.show()
def create_fruit_dataset():
"""Create a dataset of apples and oranges based on weight and texture"""
# Apple data (class 0)
apple_weights = np.random.normal(150, 20, 50) # Mean weight 150g
apple_textures = np.random.normal(7, 1, 50) # Mean texture 7
# Orange data (class 1)
orange_weights = np.random.normal(200, 25, 50) # Mean weight 200g
orange_textures = np.random.normal(4, 1, 50) # Mean texture 4
# Combine the data
X = np.vstack([
np.column_stack((apple_weights, apple_textures)),
np.column_stack((orange_weights, orange_textures))
])
# Create labels (0 for apples, 1 for oranges)
y = np.array([0] * 50 + [1] * 50)
# Create a DataFrame and save to CSV
df = pd.DataFrame({
'weight': X[:, 0],
'texture': X[:, 1],
'fruit': ['Apple' if label == 0 else 'Orange' for label in y]
})
df.to_csv('fruit_dataset.csv', index=False)
return X, y
def main():
# Create the fruit dataset
X, y = create_fruit_dataset()
# Split the data (80% training, 20% testing)
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]
# Create and train the model
knn = KNN(k=3)
knn.fit(X_train, y_train)
# Make predictions
y_pred = knn.predict(X_test)
# Calculate accuracy
accuracy = np.mean(y_pred == y_test)
print(f"Classification Accuracy: {accuracy:.2f}")
# Plot decision boundary
plot_decision_boundary(X_train, y_train, knn, "KNN Decision Boundary (Apples vs Oranges)")
# Print random example predictions
print("\nExample Predictions:")
print("Weight (g) | Texture | Predicted Fruit | Actual Fruit")
print("-" * 50)
# Get 5 random indices from test set
random_indices = np.random.choice(len(X_test), 5, replace=False)
for idx in random_indices:
weight, texture = X_test[idx]
prediction = "Apple" if y_pred[idx] == 0 else "Orange"
actual = "Apple" if y_test[idx] == 0 else "Orange"
print(f"{weight:9.1f} | {texture:7.1f} | {prediction:14s} | {actual}")
if __name__ == "__main__":
main()
Outputs:
knn % python knn_implementation.py
Classification Accuracy: 0.95
Example Predictions:
Weight (g) | Texture | Predicted Fruit | Actual Fruit
--------------------------------------------------
226.4 | 3.3 | Orange | Orange
234.2 | 3.8 | Orange | Orange
157.7 | 3.7 | Apple | Orange
188.5 | 5.1 | Orange | Orange
174.4 | 3.6 | Orange | Orange
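As noted under "How it works", the choice of k matters. Also, because the example's simple 80/20 split takes the last 20 rows, and the dataset is generated with all apples first and all oranges last, the test set above contains only oranges; cross-validation gives a fairer comparison. A hedged sketch using scikit-learn's KNeighborsClassifier on the same fruit data (assuming create_fruit_dataset from the implementation above):

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

X, y = create_fruit_dataset()                    # regenerates (and re-saves) the fruit data
for k in (1, 3, 5, 7, 9):
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=5)    # stratified 5-fold cross-validation
    print(f"k={k}: mean accuracy {scores.mean():.2f}")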
9. Decision Trees (Supervised):
I've created a complete decision tree implementation with the following components:
1. decision_tree_implementation.py: the main script that generates the data, trains the tree, and produces the plots.
2. requirements.txt: the Python packages required to run it.
3. README.md: a short description of the project and how to run it.
The dataset is generated with realistic patterns: study hours, attendance, previous scores, and sleep hours are drawn from normal distributions centred on plausible values (5 hours of study, 85% attendance, a previous score of 75%, and 7 hours of sleep).
The target variable (Pass/Fail) is generated based on rules that consider whether the student studies more than 4 hours, has attendance above 80%, has a previous score above 70%, and sleeps between 6 and 8 hours; each satisfied rule adds to the pass probability, and a little random noise is added before thresholding at 0.5.
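For example, a hypothetical student who studies 5 hours (+0.3), attends 85% of classes (+0.3), previously scored 75% (+0.3), and sleeps 7 hours (+0.1) gets a base pass probability of 0.3 + 0.3 + 0.3 + 0.1 = 1.0, comfortably above the 0.5 threshold even after the random noise is added.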
To run the implementation:
pip install -r requirements.txt
python decision_tree_implementation.py
This will: generate and save the student dataset ('student_dataset.csv'), train the decision tree, print the accuracy and classification report, save the tree plot ('decision_tree.png'), and print a few example predictions.
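For reference, the requirements.txt for this example would contain something like the following (inferred from the imports; exact versions are an assumption, and any reasonably recent releases should work):

numpy
pandas
scikit-learn
matplotlib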
Here is the implementation:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
def create_student_dataset(n_samples=100):
"""Create a synthetic dataset for student performance prediction"""
np.random.seed(42) # For reproducibility
# Generate features
study_hours = np.random.normal(5, 1.5, n_samples) # Mean 5 hours, std 1.5
attendance = np.random.normal(85, 10, n_samples) # Mean 85%, std 10%
previous_score = np.random.normal(75, 15, n_samples) # Mean 75%, std 15%
sleep_hours = np.random.normal(7, 1, n_samples) # Mean 7 hours, std 1
# Create a DataFrame
df = pd.DataFrame({
'study_hours': study_hours,
'attendance': attendance,
'previous_score': previous_score,
'sleep_hours': sleep_hours
})
# Generate target variable (Pass/Fail) based on rules
# Students are more likely to pass if they:
# - Study more than 4 hours
# - Have attendance above 80%
# - Have previous score above 70%
# - Sleep between 6-8 hours
pass_probability = (
(df['study_hours'] > 4).astype(int) * 0.3 +
(df['attendance'] > 80).astype(int) * 0.3 +
(df['previous_score'] > 70).astype(int) * 0.3 +
((df['sleep_hours'] >= 6) & (df['sleep_hours'] <= 8)).astype(int) * 0.1
)
# Add some randomness
pass_probability += np.random.normal(0, 0.1, n_samples)
# Convert to binary outcome
df['result'] = (pass_probability > 0.5).astype(int)
# Save to CSV
df.to_csv('student_dataset.csv', index=False)
return df
def train_decision_tree(X, y):
"""Train a decision tree classifier"""
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create and train the model
clf = DecisionTreeClassifier(max_depth=4, random_state=42)
clf.fit(X_train, y_train)
# Make predictions
y_pred = clf.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
return clf, X_train, X_test, y_train, y_test
def plot_decision_tree(clf, feature_names):
"""Plot the decision tree"""
plt.figure(figsize=(20,10))
plot_tree(clf, feature_names=list(feature_names), class_names=['Fail', 'Pass'],
filled=True, rounded=True, fontsize=10)
plt.savefig('decision_tree.png')
plt.show()
plt.close()
def main():
# Create the dataset
print("Creating student performance dataset...")
df = create_student_dataset(100)
# Prepare features and target
X = df[['study_hours', 'attendance', 'previous_score', 'sleep_hours']]
y = df['result']
# Train the model
print("\nTraining decision tree model...")
clf, X_train, X_test, y_train, y_test = train_decision_tree(X, y)
# Plot the decision tree
print("\nPlotting decision tree...")
plot_decision_tree(clf, X.columns)
# Print some example predictions
print("\nExample Predictions:")
print("Study Hours | Attendance | Previous Score | Sleep Hours | Predicted Result")
print("-" * 75)
# Get 5 random test examples
random_indices = np.random.choice(len(X_test), 5, replace=False)
for idx in random_indices:
features = X_test.iloc[idx]
prediction = "Pass" if clf.predict([features])[0] == 1 else "Fail"
actual = "Pass" if y_test.iloc[idx] == 1 else "Fail"
print(f"{features['study_hours']:11.1f} | {features['attendance']:10.1f} | "
f"{features['previous_score']:14.1f} | {features['sleep_hours']:11.1f} | "
f"{prediction:14s} (Actual: {actual})")
if __name__ == "__main__":
main()
Outputs:
decision_tree % python decision_tree_implementation.py
Creating student performance dataset...
Training decision tree model...
Model Accuracy: 0.85
Classification Report:
precision recall f1-score support
0 0.60 0.75 0.67 4
1 0.93 0.88 0.90 16
accuracy 0.85 20
macro avg 0.77 0.81 0.78 20
weighted avg 0.87 0.85 0.86 20
Plotting decision tree...
Example Predictions:
Study Hours | Attendance | Previous Score | Sleep Hours | Predicted Result
---------------------------------------------------------------------------
5.1 | 80.5 | 71.9 | 6.1 | Pass (Actual: Pass)
4.1 | 69.5 | 64.0 | 7.0 | Fail (Actual: Fail)
4.6 | 83.4 | 54.3 | 7.0 | Pass (Actual: Pass)
7.8 | 85.7 | 78.2 | 6.1 | Pass (Actual: Pass)
5.5 | 76.1 | 96.6 | 7.0 | Pass (Actual: Pass)
1. Feature Importance Calculation: scikit-learn computes each feature's importance as the total reduction in Gini impurity contributed by the splits on that feature, weighted by the number of samples reaching those splits and normalized so the importances sum to 1.
2. Root Node Selection: the root node is simply the feature and threshold that give the largest impurity reduction over the whole training set.
3. How it's determined: at every node, the tree tries candidate thresholds on every feature, scores each candidate by how much it lowers the weighted impurity of the two child nodes, and keeps the best one, as sketched below.
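Here is a small, hedged sketch of that impurity calculation; the split and the class counts are hypothetical, not taken from the actual run:

import numpy as np

def gini(y):
    """Gini impurity of a set of 0/1 labels."""
    if len(y) == 0:
        return 0.0
    p = np.mean(y)
    return 1 - p**2 - (1 - p)**2

# Hypothetical split of 100 students on study_hours > 4 (counts are illustrative).
parent = np.array([1] * 70 + [0] * 30)    # 70 pass, 30 fail before the split
left = np.array([1] * 60 + [0] * 10)      # students with study_hours > 4
right = np.array([1] * 10 + [0] * 20)     # students with study_hours <= 4

weighted_children = (len(left) * gini(left) + len(right) * gini(right)) / len(parent)
print("Impurity decrease:", gini(parent) - weighted_children)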
When you run the program now, you'll see the dataset being created, the model accuracy and classification report, the tree plot saved as 'decision_tree.png', and a handful of example predictions compared against the actual results.