DDC_Skills_for_AI_Agents_in_Construction duration-prediction

Predict project duration using k-nearest neighbors (k-NN) and linear regression. Estimates the timeline of a new project from similar historical projects.

install

source · Clone the upstream repo

git clone https://github.com/datadrivenconstruction/DDC_Skills_for_AI_Agents_in_Construction

Claude Code · Install into ~/.claude/skills/

T=$(mktemp -d) && git clone --depth=1 https://github.com/datadrivenconstruction/DDC_Skills_for_AI_Agents_in_Construction "$T" && mkdir -p ~/.claude/skills && cp -r "$T/2_DDC_Book/4.5-ML-Cost-Prediction/duration-prediction" ~/.claude/skills/datadrivenconstruction-ddc-skills-for-ai-agents-in-construction-duration-predict && rm -rf "$T"

manifest: 2_DDC_Book/4.5-ML-Cost-Prediction/duration-prediction/SKILL.md

source content

Duration Prediction

Business Case

Problem Statement

Project duration estimation challenges:

Subjective expert estimates
Lack of historical benchmarking
Inaccurate early-stage predictions
Difficulty comparing similar projects

Solution

Machine learning-based duration prediction using k-Nearest Neighbors and regression models trained on historical project data.

Technical Implementation

import pandas as pd
import numpy as np
from typing import Dict, Any, List, Optional, Tuple
from dataclasses import dataclass, field
from datetime import date
from enum import Enum
import math


class ModelType(Enum):
    """Prediction algorithms that ``DurationPredictor.predict`` can dispatch to."""

    KNN = "knn"  # Unweighted mean of the k nearest projects
    LINEAR_REGRESSION = "linear_regression"  # Least-squares fit on the feature vector
    WEIGHTED_KNN = "weighted_knn"  # Inverse-distance-weighted mean of the k nearest projects


class ProjectType(Enum):
    """Building categories; each value is the lowercase label used in tabular data."""

    OFFICE = "office"
    RESIDENTIAL = "residential"
    INDUSTRIAL = "industrial"
    RETAIL = "retail"
    HEALTHCARE = "healthcare"
    EDUCATION = "education"


@dataclass
class ProjectFeatures:
    """Describe one construction project for duration prediction.

    Historical (training) projects carry ``actual_duration``; a project whose
    duration is to be predicted leaves it as ``None``.
    """

    project_id: str
    project_type: ProjectType
    size_sf: float  # Building size; presumably square feet ("SF") - confirm
    floors: int
    complexity: int  # Complexity rating, 1-5
    location_factor: float  # Cost adjustment factor; also scales the baseline duration
    has_basement: bool = False
    is_renovation: bool = False
    actual_duration: Optional[int] = None  # Days; None when not (yet) known


@dataclass
class PredictionResult:
    """Outcome of a single duration prediction."""

    predicted_duration: int  # Days
    confidence_interval: Tuple[int, int]  # (lower, upper) bounds in days
    similar_projects: List[str]  # project_id values; empty for the baseline fallback
    model_used: ModelType
    features_importance: Dict[str, float]  # Feature name -> weight / relative importance


class DurationPredictor:
    """Predict project duration from historical projects.

    Three estimators share one in-memory training set:

    * plain k-NN (mean duration of the nearest projects),
    * inverse-distance-weighted k-NN,
    * least-squares linear regression on the feature vector.

    Whenever an estimator has too little data it falls back to a
    rule-of-thumb baseline derived from ``type_baseline_days``.
    """

    # Names of the entries of the vector built by ``_extract_features``, in
    # order. Distance weights and regression coefficients are paired with
    # features through this tuple instead of relying on dict insertion order.
    _FEATURE_NAMES: Tuple[str, ...] = (
        'size_sf',
        'floors',
        'complexity',
        'location_factor',
        'has_basement',
        'is_renovation',
    )

    # Multiplier applied to the spread (std dev / RMSE) for every interval.
    _Z_95: float = 1.96

    def __init__(self):
        """Create a predictor with no training data and default weights."""
        self.training_data: List[ProjectFeatures] = []
        self.feature_weights: Dict[str, float] = {
            'size_sf': 0.30,
            'floors': 0.15,
            'complexity': 0.25,
            'location_factor': 0.10,
            'has_basement': 0.10,
            'is_renovation': 0.10
        }
        self.type_baseline_days: Dict[ProjectType, Dict[str, float]] = {
            ProjectType.OFFICE: {'base': 300, 'per_1000sf': 0.5},
            ProjectType.RESIDENTIAL: {'base': 240, 'per_1000sf': 0.4},
            ProjectType.INDUSTRIAL: {'base': 180, 'per_1000sf': 0.3},
            ProjectType.RETAIL: {'base': 200, 'per_1000sf': 0.35},
            ProjectType.HEALTHCARE: {'base': 400, 'per_1000sf': 0.6},
            ProjectType.EDUCATION: {'base': 320, 'per_1000sf': 0.45}
        }

    def add_training_project(self, project: ProjectFeatures):
        """Add a project to the training set.

        Projects without an ``actual_duration`` are silently ignored because
        they cannot be used as training examples.
        """
        if project.actual_duration is not None:
            self.training_data.append(project)

    @staticmethod
    def _coerce_bool(value: Any, default: bool = False) -> bool:
        """Interpret a tabular cell as a flag; missing/blank cells give ``default``."""
        if isinstance(value, str):
            text = value.strip().lower()
            if not text:
                return default
            # bool("False") is True, so textual flags need explicit parsing.
            return text in ('1', 'true', 't', 'yes', 'y')
        if value is None or pd.isna(value):
            # bool(float('nan')) is True, so missing cells must be caught here.
            return default
        return bool(value)

    @staticmethod
    def _coerce_float(value: Any, default: float) -> float:
        """Interpret a tabular cell as a float; missing cells give ``default``."""
        if value is None or pd.isna(value):
            return default
        return float(value)

    def load_training_data(self, df: pd.DataFrame):
        """Load training projects from a DataFrame.

        Required columns: ``project_id``, ``project_type``, ``size_sf``,
        ``floors``, ``complexity``, ``actual_duration``. Optional columns:
        ``location_factor`` (default 1.0), ``has_basement`` and
        ``is_renovation`` (default False). Missing values in optional columns
        fall back to the same defaults.

        Rows whose ``actual_duration`` is missing are skipped.

        Raises:
            KeyError: If a required column is absent.
            ValueError: If ``project_type`` is not a known ``ProjectType``
                value or a required numeric cell cannot be converted.
        """
        for _, row in df.iterrows():
            duration = row['actual_duration']
            if duration is None or pd.isna(duration):
                # Same policy as add_training_project: no duration, no example.
                continue

            project = ProjectFeatures(
                project_id=str(row['project_id']),
                project_type=ProjectType(str(row['project_type']).strip().lower()),
                size_sf=float(row['size_sf']),
                floors=int(row['floors']),
                complexity=int(row['complexity']),
                location_factor=self._coerce_float(row.get('location_factor'), 1.0),
                has_basement=self._coerce_bool(row.get('has_basement')),
                is_renovation=self._coerce_bool(row.get('is_renovation')),
                actual_duration=int(duration)
            )
            self.add_training_project(project)

    def _extract_features(self, project: ProjectFeatures) -> np.ndarray:
        """Return the numeric feature vector, ordered as ``_FEATURE_NAMES``."""
        return np.array([
            project.size_sf / 10000,  # Normalize to 10k SF
            project.floors,
            project.complexity,
            project.location_factor,
            1 if project.has_basement else 0,
            1 if project.is_renovation else 0
        ])

    def _calculate_distance(self, features1: np.ndarray,
                            features2: np.ndarray) -> float:
        """Return the weighted Euclidean distance between two feature vectors.

        Weights are looked up by feature name; a feature missing from
        ``feature_weights`` gets weight 0 and is ignored.
        """
        weights = np.array([self.feature_weights.get(name, 0.0)
                            for name in self._FEATURE_NAMES])
        squared_diff = (features1 - features2) ** 2
        return math.sqrt(float(np.sum(squared_diff * weights)))

    def _find_k_nearest(self, target: ProjectFeatures, k: int = 5,
                        same_type: bool = True) -> List[Tuple[ProjectFeatures, float]]:
        """Return up to ``k`` training projects closest to ``target``.

        Args:
            target: Project to compare against.
            k: Maximum number of neighbours; values <= 0 yield an empty list.
            same_type: When True, only projects of the target's type qualify.

        Returns:
            ``(project, distance)`` pairs sorted by ascending distance.
        """
        if k <= 0:
            # A negative k would otherwise slice off the *last* neighbours.
            return []

        target_features = self._extract_features(target)
        distances = [
            (project, self._calculate_distance(target_features,
                                               self._extract_features(project)))
            for project in self.training_data
            if not same_type or project.project_type == target.project_type
        ]
        distances.sort(key=lambda pair: pair[1])
        return distances[:k]

    def _confidence_interval(self, center: int, spread: float) -> Tuple[int, int]:
        """Return ``center +/- 1.96 * spread`` in whole days, floored at 1 day."""
        margin = self._Z_95 * spread
        lower = max(1, int(center - margin))
        # Keep the interval ordered even when the lower bound was clamped.
        upper = max(lower, int(center + margin))
        return lower, upper

    def predict_knn(self, target: ProjectFeatures, k: int = 5) -> PredictionResult:
        """Predict duration as the mean of the ``k`` nearest same-type projects.

        Falls back to the baseline when no same-type training project exists.
        """
        nearest = self._find_k_nearest(target, k)
        if not nearest:
            return self._predict_baseline(target)

        durations = [project.actual_duration for project, _ in nearest]
        predicted = int(np.mean(durations))

        return PredictionResult(
            predicted_duration=predicted,
            confidence_interval=self._confidence_interval(
                predicted, float(np.std(durations))),
            similar_projects=[project.project_id for project, _ in nearest],
            model_used=ModelType.KNN,
            # Copy so callers cannot mutate the predictor's weights.
            features_importance=dict(self.feature_weights)
        )

    def predict_weighted_knn(self, target: ProjectFeatures, k: int = 5) -> PredictionResult:
        """Predict duration with inverse-distance-weighted k-NN.

        Falls back to the baseline when no same-type training project exists.
        """
        nearest = self._find_k_nearest(target, k)
        if not nearest:
            return self._predict_baseline(target)

        total_weight = 0.0
        weighted_sum = 0.0
        for project, distance in nearest:
            # The small offset keeps an exact match from dividing by zero.
            weight = 1 / (distance + 0.001)
            weighted_sum += project.actual_duration * weight
            total_weight += weight

        predicted = int(weighted_sum / total_weight)
        durations = [project.actual_duration for project, _ in nearest]

        return PredictionResult(
            predicted_duration=predicted,
            confidence_interval=self._confidence_interval(
                predicted, float(np.std(durations))),
            similar_projects=[project.project_id for project, _ in nearest],
            model_used=ModelType.WEIGHTED_KNN,
            features_importance=dict(self.feature_weights)
        )

    def predict_regression(self, target: ProjectFeatures) -> PredictionResult:
        """Predict duration with least-squares linear regression.

        The model is fitted on same-type projects when at least three exist,
        otherwise on the whole training set. The baseline is used instead
        when there are fewer than three training projects overall, or when
        the fit leaves no residual degrees of freedom (no more samples than
        independent columns), since the fit would then merely interpolate.

        The interval uses the in-sample RMSE, which is optimistic for small
        training sets.
        """
        if len(self.training_data) < 3:
            return self._predict_baseline(target)

        same_type = [p for p in self.training_data
                     if p.project_type == target.project_type]
        restricted = len(same_type) >= 3
        samples = same_type if restricted else self.training_data

        X = np.array([self._extract_features(p) for p in samples], dtype=float)
        y = np.array([p.actual_duration for p in samples], dtype=float)
        design = np.column_stack([np.ones(len(X)), X])

        # lstsq copes with constant or collinear columns; inverting X'X only
        # raises for exactly singular matrices and returns garbage otherwise.
        try:
            beta, _, rank, _ = np.linalg.lstsq(design, y, rcond=None)
        except np.linalg.LinAlgError:
            return self._predict_baseline(target)

        if len(samples) <= rank or not np.all(np.isfinite(beta)):
            return self._predict_baseline(target)

        target_features = self._extract_features(target)
        raw = float(target_features @ beta[1:] + beta[0])
        if not math.isfinite(raw):
            return self._predict_baseline(target)
        # Clamp first so the interval is centred on the reported value.
        predicted = max(1, int(raw))

        residuals = y - design @ beta
        rmse = math.sqrt(float(np.mean(residuals ** 2)))

        magnitudes = np.abs(beta[1:])
        total = float(magnitudes.sum())
        shares = magnitudes / total if total > 0 else np.zeros_like(magnitudes)

        # Report the projects actually closest to the target, drawn from the
        # same pool the model was fitted on.
        neighbours = self._find_k_nearest(target, k=5, same_type=restricted)

        return PredictionResult(
            predicted_duration=predicted,
            confidence_interval=self._confidence_interval(predicted, rmse),
            similar_projects=[p.project_id for p, _ in neighbours],
            model_used=ModelType.LINEAR_REGRESSION,
            features_importance={name: float(share) for name, share
                                 in zip(self._FEATURE_NAMES, shares)}
        )

    def _predict_baseline(self, target: ProjectFeatures) -> PredictionResult:
        """Estimate duration from per-type rules of thumb.

        Starts from the type's base days plus a size-proportional term, then
        applies multiplicative adjustments for complexity above 3, basement,
        renovation and location factor. The interval is +/- 20%.

        ModelType has no baseline member, so the result is reported as
        ``LINEAR_REGRESSION``; it is the only result with an empty
        ``similar_projects`` list.
        """
        baseline = self.type_baseline_days.get(target.project_type,
                                                {'base': 250, 'per_1000sf': 0.4})

        predicted = int(baseline['base'] +
                       (target.size_sf / 1000) * baseline['per_1000sf'] * 30)

        # Adjustments (each truncated to whole days, as before)
        if target.complexity > 3:
            predicted = int(predicted * (1 + (target.complexity - 3) * 0.1))
        if target.has_basement:
            predicted = int(predicted * 1.1)
        if target.is_renovation:
            predicted = int(predicted * 1.2)

        predicted = int(predicted * target.location_factor)

        return PredictionResult(
            predicted_duration=predicted,
            confidence_interval=(int(predicted * 0.8), int(predicted * 1.2)),
            similar_projects=[],
            model_used=ModelType.LINEAR_REGRESSION,
            features_importance=dict(self.feature_weights)
        )

    def predict(self, target: ProjectFeatures,
                model: ModelType = ModelType.WEIGHTED_KNN,
                k: int = 5) -> PredictionResult:
        """Predict duration with the requested model.

        Args:
            target: Project to estimate.
            model: Algorithm to use; anything unrecognised yields the baseline.
            k: Neighbour count for the k-NN models (ignored by regression).
        """
        if model == ModelType.KNN:
            return self.predict_knn(target, k)
        if model == ModelType.WEIGHTED_KNN:
            return self.predict_weighted_knn(target, k)
        if model == ModelType.LINEAR_REGRESSION:
            return self.predict_regression(target)

        return self._predict_baseline(target)

    def evaluate_model(self, test_data: List[ProjectFeatures],
                       model: ModelType = ModelType.WEIGHTED_KNN,
                       k: int = 5) -> Dict[str, float]:
        """Score a model on projects with known durations.

        Test projects without ``actual_duration`` are skipped. Projects that
        are also in the training set make the k-NN scores optimistic, because
        an exact match dominates the inverse-distance weighting.

        Args:
            test_data: Projects to predict and compare against their actuals.
            model: Algorithm to evaluate.
            k: Neighbour count passed to the k-NN models.

        Returns:
            ``{'mae', 'mape', 'rmse', 'samples'}`` with errors in days (MAPE
            in percent), or an empty dict when nothing could be scored. MAPE
            covers only projects with a non-zero actual duration and is NaN
            when there are none.
        """
        scored = [p for p in test_data if p.actual_duration is not None]
        if not scored:
            return {}

        actuals = np.array([p.actual_duration for p in scored], dtype=float)
        predictions = np.array(
            [self.predict(p, model, k).predicted_duration for p in scored],
            dtype=float)

        errors = actuals - predictions
        mae = float(np.mean(np.abs(errors)))
        rmse = math.sqrt(float(np.mean(errors ** 2)))

        # Percentage error is undefined for a zero actual; leave those out.
        nonzero = actuals != 0
        if nonzero.any():
            mape = float(np.mean(np.abs(errors[nonzero] / actuals[nonzero])) * 100)
        else:
            mape = float('nan')

        return {
            'mae': round(mae, 1),
            'mape': round(mape, 1),
            'rmse': round(rmse, 1),
            'samples': len(scored)
        }

    def get_similar_projects(self, target: ProjectFeatures, n: int = 10) -> pd.DataFrame:
        """Return the ``n`` most similar training projects across all types.

        The frame always has the same columns, even when it has no rows.
        """
        columns = ['Project ID', 'Type', 'Size (SF)', 'Floors',
                   'Complexity', 'Duration (days)', 'Distance']

        nearest = self._find_k_nearest(target, k=n, same_type=False)

        data = [{
            'Project ID': p.project_id,
            'Type': p.project_type.value,
            'Size (SF)': p.size_sf,
            'Floors': p.floors,
            'Complexity': p.complexity,
            'Duration (days)': p.actual_duration,
            'Distance': round(d, 3)
        } for p, d in nearest]

        return pd.DataFrame(data, columns=columns)

Quick Start

# Start with an empty predictor
predictor = DurationPredictor()

# Historical office projects (none of them renovations):
# (id, size in SF, floors, complexity, location factor, has basement, duration in days)
history = [
    ("P001", 50000, 10, 3, 1.0, True, 365),
    ("P002", 75000, 15, 4, 1.1, True, 450),
    ("P003", 30000, 5, 2, 0.9, False, 280),
    ("P004", 60000, 12, 3, 1.0, True, 390),
]
training_projects = [
    ProjectFeatures(
        project_id=pid,
        project_type=ProjectType.OFFICE,
        size_sf=size,
        floors=floors,
        complexity=complexity,
        location_factor=location,
        has_basement=basement,
        is_renovation=False,
        actual_duration=days,
    )
    for pid, size, floors, complexity, location, basement, days in history
]

for p in training_projects:
    predictor.add_training_project(p)

# Describe the project to estimate; actual_duration stays unset
new_project = ProjectFeatures(
    "NEW-001",
    ProjectType.OFFICE,
    size_sf=55000,
    floors=11,
    complexity=3,
    location_factor=1.0,
    has_basement=True,
    is_renovation=False,
)

result = predictor.predict(new_project, ModelType.WEIGHTED_KNN)
print(f"Predicted duration: {result.predicted_duration} days")
print(f"Confidence interval: {result.confidence_interval}")
print(f"Similar projects: {result.similar_projects}")

Common Use Cases

1. Compare Models

# Run the same project through every model, in a fixed order
knn_result, weighted_result, regression_result = (
    predictor.predict(new_project, model)
    for model in (ModelType.KNN, ModelType.WEIGHTED_KNN, ModelType.LINEAR_REGRESSION)
)

print(f"k-NN: {knn_result.predicted_duration}")
print(f"Weighted k-NN: {weighted_result.predicted_duration}")
print(f"Regression: {regression_result.predicted_duration}")

2. Model Evaluation

# Held-out projects with known durations (illustrative values); keep them
# out of the training set so the scores are not flattered by exact matches.
test_data = [
    ProjectFeatures("T001", ProjectType.OFFICE, 45000, 9, 3, 1.0, True, False, 350),
    ProjectFeatures("T002", ProjectType.OFFICE, 65000, 13, 4, 1.05, True, False, 420),
]

metrics = predictor.evaluate_model(test_data, ModelType.WEIGHTED_KNN)
if metrics:  # evaluate_model returns {} when no test project has a duration
    print(f"MAE: {metrics['mae']} days")
    print(f"MAPE: {metrics['mape']}%")

3. Find Similar Projects

# Returns a DataFrame of the 5 closest training projects; unlike the k-NN
# predictors, this search is not restricted to the target's project type.
similar = predictor.get_similar_projects(new_project, n=5)
print(similar)

Resources

DDC Book: Chapter 4.5 - Future: Predictions and Machine Learning
scikit-learn (library implementations of k-NN and regression; the code above uses only NumPy and pandas): https://scikit-learn.org/
Website: https://datadrivenconstruction.io