In [12]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
import pandas as pd
import numpy as np
from notebookutils import mssparkutils

def train_and_deploy_model():
    """
    Train and deploy a model with comprehensive NaN handling in Microsoft Fabric
    """
    try:
        # 1. Load data from Lakehouse with validation
        print("üîç Loading data from Lakehouse...")
        df = pd.read_csv("/lakehouse/default/Files/PDC_biospecimen_manifest_03272025_214257.csv")
        print(f"‚úÖ Data loaded successfully. Shape: {df.shape}")
        
        # 2. Data validation
        print("\nüîé Validating data...")
        feature_cols = [
            'Aliquot Quantity', 
            'Aliquot Volume',
            'Concentration',
            'Days To Collection',
            'Days To Sample Procurement',
            'Current Weight',
            'Initial Weight'
        ]
        feature_cols = [col for col in feature_cols if col in df.columns]
        
        if not feature_cols:
            raise ValueError("‚ùå No valid feature columns found")
        
        target_col = 'Case Status'
        if target_col not in df.columns:
            raise ValueError(f"‚ùå Target column '{target_col}' not found")
        
        # 3. NaN handling and preprocessing
        print("\nüßπ Handling missing values...")
        
        # Show missing values before imputation
        print("\nMissing values per feature column:")
        print(df[feature_cols].isnull().sum())
        
        # Encode target (no NaN handling needed as it's categorical)
        le = LabelEncoder()
        y = le.fit_transform(df[target_col].astype(str))
        
        # Create preprocessing pipeline
        preprocessor = make_pipeline(
            SimpleImputer(strategy='median'),  # Handles NaN values
            StandardScaler()                   # Scales features
        )
        
        # Apply preprocessing
        X = preprocessor.fit_transform(df[feature_cols])
        
        # Verify no NaN values remain
        if np.isnan(X).any():
            raise ValueError("‚ùå NaN values still present after preprocessing")
        
        # 4. Train model with MLflow tracking
        print("\nü§ñ Training model...")
        with mlflow.start_run():
            # Create and train model
            model = RandomForestClassifier(
                n_estimators=100,
                random_state=42,
                n_jobs=-1,
                class_weight='balanced'
            )
            
            # Split data
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42)
            
            model.fit(X_train, y_train)
            
            # Log important information
            mlflow.log_param("n_estimators", 100)
            mlflow.log_metric("train_accuracy", model.score(X_train, y_train))
            mlflow.log_metric("test_accuracy", model.score(X_test, y_test))
            
            # Log the preprocessing pipeline
            mlflow.sklearn.log_model(preprocessor, "preprocessor")
            
            # Log the trained model
            mlflow.sklearn.log_model(model, "model")
            
            # Save feature columns as artifact
            with open("feature_columns.txt", "w") as f:
                f.write("\n".join(feature_cols))
            mlflow.log_artifact("feature_columns.txt")
            
            # Register model
            mlflow.register_model(
                "runs:/{}/model".format(mlflow.active_run().info.run_id),
                "BiospecimenClassifier"
            )
            
            print("‚úÖ Model trained and registered successfully!")
            
            # Return run ID for reference
            return mlflow.active_run().info.run_id
            
    except Exception as e:
        print(f"\n‚ùå Error in model deployment: {str(e)}")
        print("\nüõ†Ô∏è Troubleshooting steps:")
        print("1. Check for missing values in your data")
        print("2. Verify all feature columns exist")
        print("3. Ensure target column has valid values")
        raise

if __name__ == "__main__":
    train_and_deploy_model()

StatementMeta(, e50020a4-5e38-4ad5-9b81-a3040d5ab3df, 15, Finished, Available, Finished)

üîç Loading data from Lakehouse...
‚úÖ Data loaded successfully. Shape: (452, 45)

üîé Validating data...

üßπ Handling missing values...

Missing values per feature column:
Aliquot Quantity              452
Aliquot Volume                452
Concentration                 452
Days To Collection            347
Days To Sample Procurement    452
Current Weight                452
Initial Weight                347
dtype: int64



ü§ñ Training model...
‚úÖ Model trained and registered successfully!


2025-04-01:11:48:01,946 ERROR    [shared_platform_utils.py:82] Create MLModel failed, status_code: 400, b'{"requestId":"6f84f5ef-22d8-443f-b2b2-f98075bc112b","errorCode":"ItemDisplayNameAlreadyInUse","message":"Requested \'BiospecimenClassifier\' is already in use"}'
Registered model 'BiospecimenClassifier' already exists. Creating a new version of this model...


In [13]:
import mlflow
import pandas as pd
import numpy as np
import json
from notebookutils import mssparkutils

def load_model_artifacts():
    """
    Load model and preprocessing pipeline from MLflow
    """
    try:
        # Load the entire run artifacts
        client = mlflow.tracking.MlflowClient()
        latest_version = client.get_latest_versions("BiospecimenClassifier")[0]
        run_id = latest_version.run_id
        
        # Load model and preprocessor separately
        model_uri = f"runs:/{run_id}/model"
        preprocessor_uri = f"runs:/{run_id}/preprocessor"
        
        model = mlflow.sklearn.load_model(model_uri)
        preprocessor = mlflow.sklearn.load_model(preprocessor_uri)
        
        return model, preprocessor, latest_version.version
    
    except Exception as e:
        print(f"Error loading model artifacts: {str(e)}")
        raise

# Load model components
model, preprocessor, model_version = load_model_artifacts()

def validate_input(data):
    """Validate input data structure and values"""
    required_columns = [
        'Aliquot Quantity', 
        'Aliquot Volume',
        'Concentration',
        'Days To Collection',
        'Days To Sample Procurement',
        'Current Weight',
        'Initial Weight'
    ]
    
    # Check all required columns are present
    missing_cols = [col for col in required_columns if col not in data]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")
    
    # Convert to DataFrame for processing
    input_df = pd.DataFrame([data])
    
    # Check for non-numeric values
    for col in required_columns:
        if not pd.api.types.is_numeric_dtype(input_df[col]):
            try:
                input_df[col] = pd.to_numeric(input_df[col])
            except:
                raise ValueError(f"Column {col} contains non-numeric value: {input_df[col].values[0]}")
    
    return input_df[required_columns]

def score(data):
    """
    Score new data using the deployed model
    Args:
        data: Dictionary with feature values
    Returns:
        Dictionary with predictions and metadata
    """
    try:
        # Validate and prepare input
        input_data = validate_input(data)
        
        # Preprocess (handles NaN if any remain)
        processed_data = preprocessor.transform(input_data)
        
        # Verify no NaN values
        if np.isnan(processed_data).any():
            raise ValueError("NaN values present after preprocessing")
        
        # Get predictions
        prediction = int(model.predict(processed_data)[0])
        probabilities = model.predict_proba(processed_data)[0].tolist()
        
        return {
            "prediction": prediction,
            "probabilities": probabilities,
            "status": "success",
            "model_version": model_version
        }
        
    except Exception as e:
        return {
            "error": str(e),
            "status": "error",
            "input_received": data
        }

# Example test
if __name__ == "__main__":
    test_data = {
        "Aliquot Quantity": 5.2,
        "Aliquot Volume": 1.0,
        "Concentration": 50.0,
        "Days To Collection": 10,
        "Days To Sample Procurement": 12,
        "Current Weight": 0.5,
        "Initial Weight": 0.6
    }
    
    result = score(test_data)
    print("Test scoring result:")
    print(json.dumps(result, indent=2))
    

StatementMeta(, e50020a4-5e38-4ad5-9b81-a3040d5ab3df, 16, Finished, Available, Finished)

  latest_version = client.get_latest_versions("BiospecimenClassifier")[0]


Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Test scoring result:
{
  "prediction": 1,
  "probabilities": [
    0.0,
    1.0
  ],
  "status": "success",
  "model_version": "2"
}


StatementMeta(, e50020a4-5e38-4ad5-9b81-a3040d5ab3df, 17, Finished, Available, Finished)