Repository: https://github.com/jackccrawford/gpu-sentinel-pro
HEAD commit: 66d8e95994bc324c848385f5eab6df01a48eb886
Total files: 127 · Rendered: 93 · Skipped: 34

Directory tree

repo
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   ├── workflows
│   │   ├── ci.yml
│   │   ├── codeql.yml
│   │   ├── snyk.yml
│   │   └── sonarcloud.yml
│   ├── dependabot.yml
│   └── pull_request_template.md
├── backend
│   ├── migrations
│   │   ├── 001_create_gpu_metrics_table.sql
│   │   └── 002_create_alerts_table.sql
│   ├── src
│   │   ├── database
│   │   │   ├── __init__.py
│   │   │   ├── client.py
│   │   │   ├── config.py
│   │   │   └── test_connection.py
│   │   ├── models
│   │   │   ├── __init__.py
│   │   │   └── gpu_metrics.py
│   │   ├── service
│   │   │   ├── __init__.py
│   │   │   ├── alert_manager.py
│   │   │   ├── alerts.py
│   │   │   ├── analytics_service.py
│   │   │   ├── app.py
│   │   │   ├── config.py
│   │   │   ├── config.yaml
│   │   │   ├── gpu_service.log
│   │   │   ├── logging_manager.py
│   │   │   ├── run_service.sh
│   │   │   ├── service.pid
│   │   │   ├── settings.py
│   │   │   ├── stop_service.sh
│   │   │   ├── system_health.py
│   │   │   ├── test_alerts.py
│   │   │   ├── test_db.py
│   │   │   └── test_settings.py
│   │   └── __init__.py
│   ├── README.md
│   └── requirements.txt
├── docs
│   ├── architecture
│   │   └── ARCHITECTURE.md
│   ├── requirements
│   │   ├── DEVELOPMENT_GUIDE.md
│   │   ├── REQUIREMENTS.md
│   │   └── TECHNICAL_SPEC.md
│   ├── API.md
│   ├── INSTALLATION.md
│   └── README.md
├── frontend
│   ├── public
│   │   └── vite.svg
│   ├── src
│   │   ├── assets
│   │   │   └── react.svg
│   │   ├── components
│   │   │   ├── AlertsPanel.tsx
│   │   │   ├── MetricsGrid.tsx
│   │   │   └── TimeSeriesChart.tsx
│   │   ├── App.tsx
│   │   ├── index.css
│   │   ├── main.tsx
│   │   └── vite-env.d.ts
│   ├── Dockerfile
│   ├── eslint.config.js
│   ├── frontend.log
│   ├── frontend.pid
│   ├── index.html
│   ├── package-lock.json
│   ├── package.json
│   ├── README.md
│   ├── run_frontend.sh
│   ├── stop_frontend.sh
│   ├── tsconfig.app.json
│   ├── tsconfig.json
│   ├── tsconfig.node.json
│   └── vite.config.ts
├── images
│   ├── DarkMode-Stressed.png
│   ├── gpu-burn-danger-zone.png
│   ├── nvidia-smi.png
│   └── Ollama-Mistral-Small.png
├── src
│   ├── collector
│   │   ├── __init__.py
│   │   └── collector.py
│   └── __init__.py
├── supabase
│   ├── init
│   │   └── 00-init.sql
│   ├── .env.supabase
│   ├── config.toml
│   ├── docker-compose.simple.yml
│   ├── docker-compose.yml
│   ├── docker-compose.yml.bak
│   ├── kong.yml
│   ├── seed.sql
│   ├── start.sh
│   └── stop.sh
├── .dockerignore
├── .editorconfig
├── .env.example
├── .env.template
├── .gitignore
├── CHANGELOG.md
├── CONTRIBUTING.md
├── docker-compose.yml
├── Dockerfile
├── frontend.pid
├── LICENSE
├── README.md
├── requirements.txt
├── run.sh
├── SECURITY.md
├── sonar-project.properties
└── TODO.md


Skipped items

Skipped binaries (2)
  • frontend/public/vite.svg (1.5 KiB)
  • frontend/src/assets/react.svg (4.0 KiB)
Skipped large files (5)
  • frontend/package-lock.json (158.5 KiB)
  • images/DarkMode-Stressed.png (71.1 KiB)
  • images/Ollama-Mistral-Small.png (64.0 KiB)
  • images/gpu-burn-danger-zone.png (75.4 KiB)
  • images/nvidia-smi.png (163.3 KiB)

.dockerignore (254 B)

# Version control
.git
.gitignore

# Python
__pycache__
*.pyc
*.pyo
*.pyd
.Python
venv/
.env

# Node
node_modules/
npm-debug.log
yarn-debug.log
yarn-error.log

# Build
dist/
build/
*.egg-info/

# IDE
.idea/
.vscode/
*.swp
*.swo

# OS
.DS_Store
Thumbs.db

.editorconfig (705 B)

# EditorConfig is awesome: https://EditorConfig.org

# top-most EditorConfig file
root = true

# Unix-style newlines with a newline ending every file
[*]
end_of_line = lf
insert_final_newline = true
charset = utf-8
trim_trailing_whitespace = true

# Python files
[*.py]
indent_style = space
indent_size = 4
max_line_length = 100

# JavaScript/TypeScript files
[*.{js,jsx,ts,tsx}]
indent_style = space
indent_size = 2
max_line_length = 100

# YAML files
[*.{yml,yaml}]
indent_style = space
indent_size = 2

# Markdown files
[*.md]
trim_trailing_whitespace = false
max_line_length = off

# JSON files
[*.json]
indent_style = space
indent_size = 2

# Shell scripts
[*.sh]
indent_style = space
indent_size = 2

.env.example (74 B)

SUPABASE_URL=your_supabase_project_url
SUPABASE_KEY=your_supabase_anon_key

.env.template (196 B)

# Supabase Configuration
SUPABASE_URL=your-project-url-here
SUPABASE_KEY=your-anon-key-here

# API Configuration
API_URL=http://localhost:5000/api/gpu-stats
COLLECTION_INTERVAL=0.25  # in seconds
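
For orientation, a minimal sketch of a polling loop driven by these variables. This is illustrative only: it uses the stdlib urllib client for brevity and is not necessarily how src/collector/collector.py consumes the template.

# Hypothetical polling loop; reads the variables defined in .env.template.
import json
import os
import time
import urllib.request

from dotenv import load_dotenv  # python-dotenv is in backend/requirements.txt

load_dotenv()

API_URL = os.getenv("API_URL", "http://localhost:5000/api/gpu-stats")
INTERVAL = float(os.getenv("COLLECTION_INTERVAL", "0.25"))  # seconds between polls

while True:
    with urllib.request.urlopen(API_URL, timeout=5) as resp:
        stats = json.load(resp)
    print(stats.get("success"))
    time.sleep(INTERVAL)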

.github/ISSUE_TEMPLATE/bug_report.md (729 B)


name: Bug Report
about: Create a report to help us improve
title: '[BUG] '
labels: 'bug'
assignees: ''


Describe the Bug

A clear and concise description of what the bug is.

To Reproduce

Steps to reproduce the behavior:
  1. Go to '...'
  2. Click on '....'
  3. See error

Expected Behavior

A clear description of what you expected to happen.

Screenshots

If applicable, add screenshots to help explain your problem.

Environment:
  • OS: [e.g., Ubuntu 22.04]
  • GPU: [e.g., NVIDIA RTX 3080]
  • Driver Version: [e.g., 535.183.01]
  • Browser: [e.g., Chrome 120]
  • GPU Sentinel Version: [e.g., 1.0.0]

Additional Context
  • Backend logs (if applicable)
  • Frontend logs (if applicable)
  • Any error messages

.github/ISSUE_TEMPLATE/feature_request.md (757 B)


name: Feature Request
about: Suggest an idea for GPU Sentinel Pro
title: '[FEATURE] '
labels: 'enhancement'
assignees: ''


Is your feature request related to a problem?

A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

Describe the solution you'd like

A clear and concise description of what you want to happen.

Describe alternatives you've considered

A clear and concise description of any alternative solutions or features you've considered.

Use Case

Describe how you would use this feature in your workflow.

Additional Context
  • Expected impact on performance
  • Related features or dependencies
  • Screenshots or mockups if applicable
  • Any other context about the feature request

.github/dependabot.yml (899 B)

version: 2
updates:
  # Frontend dependencies
  - package-ecosystem: "npm"
    directory: "/frontend"
    schedule:
      interval: "weekly"
    open-pull-requests-limit: 10
    labels:
      - "npm"
      - "dependencies"
    commit-message:
      prefix: "chore"
      prefix-development: "chore"
      include: "scope"

  # Backend dependencies
  - package-ecosystem: "pip"
    directory: "/backend"
    schedule:
      interval: "weekly"
    open-pull-requests-limit: 10
    labels:
      - "pip"
      - "dependencies"
    commit-message:
      prefix: "chore"
      prefix-development: "chore"
      include: "scope"

  # GitHub Actions
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "weekly"
    labels:
      - "github-actions"
      - "dependencies"
    commit-message:
      prefix: "chore"
      prefix-development: "chore"
      include: "scope"

.github/pull_request_template.md (971 B)

Description

Brief description of the changes

Type of Change

  • [ ] Bug fix (non-breaking change that fixes an issue)
  • [ ] New feature (non-breaking change that adds functionality)
  • [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
  • [ ] Documentation update
  • [ ] Performance improvement
  • [ ] Code cleanup or refactor

Testing

  • [ ] Unit tests added/updated
  • [ ] Manual testing performed
  • [ ] All tests passing

Screenshots

If applicable, add screenshots to help explain your changes.

Checklist

  • [ ] My code follows the project's style guidelines
  • [ ] I have performed a self-review of my code
  • [ ] I have commented my code, particularly in hard-to-understand areas
  • [ ] I have made corresponding changes to the documentation
  • [ ] My changes generate no new warnings
  • [ ] I have updated the CHANGELOG.md file

Additional Notes

Any additional information that would be helpful for reviewers.

.github/workflows/ci.yml (1.5 KiB)

name: CI

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  backend-tests:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.10", "3.11"]

    steps:
    - uses: actions/checkout@v4
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}
    
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r backend/requirements.txt
        pip install pytest pytest-cov pylint black

    - name: Check formatting
      run: |
        black --check backend/

    - name: Lint with pylint
      run: |
        pylint backend/src/

    - name: Run tests with coverage
      run: |
        pytest backend/tests/ --cov=backend/src/ --cov-report=xml

  frontend-tests:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        node-version: [18.x, 20.x]

    steps:
    - uses: actions/checkout@v4
    - name: Use Node.js ${{ matrix.node-version }}
      uses: actions/setup-node@v3
      with:
        node-version: ${{ matrix.node-version }}
        
    - name: Install dependencies
      run: |
        cd frontend
        npm ci

    - name: Check formatting
      run: |
        cd frontend
        npm run format:check

    - name: Lint
      run: |
        cd frontend
        npm run lint

    - name: Type check
      run: |
        cd frontend
        npm run typecheck

    - name: Build
      run: |
        cd frontend
        npm run build

.github/workflows/codeql.yml (838 B)

name: "CodeQL"

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]
  schedule:
    - cron: '30 1 * * 0'  # Run at 1:30 UTC every Sunday

jobs:
  analyze:
    name: Analyze
    runs-on: ubuntu-latest
    permissions:
      actions: read
      contents: read
      security-events: write

    strategy:
      fail-fast: false
      matrix:
        language: [ 'python', 'javascript', 'typescript' ]

    steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Initialize CodeQL
      uses: github/codeql-action/init@v2
      with:
        languages: ${{ matrix.language }}

    - name: Autobuild
      uses: github/codeql-action/autobuild@v2

    - name: Perform CodeQL Analysis
      uses: github/codeql-action/analyze@v2
      with:
        category: "/language:${{matrix.language}}"

.github/workflows/snyk.yml (1.0 KiB)

name: Snyk Security Scan

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  schedule:
    - cron: '0 2 * * 0'  # Run at 2:00 UTC every Sunday

jobs:
  security:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Set up Node.js
        uses: actions/setup-node@v3
        with:
          node-version: '20'

      - name: Install dependencies
        run: |
          python -m pip install -r backend/requirements.txt
          cd frontend && npm install

      - name: Run Snyk to check for vulnerabilities
        uses: snyk/actions/python@master
        env:
          SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }}
        with:
          command: monitor

      - name: Run Snyk on frontend
        uses: snyk/actions/node@master
        env:
          SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }}
        with:
          command: monitor
          args: --all-projects

.github/workflows/sonarcloud.yml (1.6 KiB)

name: SonarCloud Analysis

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  sonarcloud:
    name: SonarCloud
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Set up Node.js
        uses: actions/setup-node@v3
        with:
          node-version: '20'

      - name: Install dependencies
        run: |
          python -m pip install -r backend/requirements.txt
          python -m pip install coverage pytest
          cd frontend && npm install

      - name: Run backend tests with coverage
        run: |
          cd backend
          coverage run -m pytest
          coverage xml -o coverage-reports/coverage.xml

      - name: Run frontend tests with coverage
        run: |
          cd frontend
          npm run test:coverage

      - name: SonarCloud Scan
        uses: SonarSource/sonarcloud-github-action@master
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
        with:
          args: >
            -Dsonar.organization=jackccrawford
            -Dsonar.projectKey=jackccrawford_gpu-sentinel-pro
            -Dsonar.python.coverage.reportPaths=backend/coverage-reports/coverage.xml
            -Dsonar.javascript.lcov.reportPaths=frontend/coverage/lcov.info
            -Dsonar.sources=backend/src,frontend/src
            -Dsonar.tests=backend/tests,frontend/src/**/*.test.tsx

.gitignore (511 B)

# Dependencies
node_modules/
**/node_modules/**
venv/
__pycache__/

# Environment files
.env
*.env

# Logs
*.log
frontend/frontend.log
backend/src/service/gpu_service.log

# Service PIDs
*.pid
frontend/frontend.pid
backend/src/service/service.pid

# Build files
dist/
build/
*.pyc

# IDE files
.vscode/
.idea/
*.swp
*.swo

# OS files
.DS_Store
Thumbs.db

# Supabase local dev
supabase/volumes/

# TypeScript
*.tsbuildinfo
frontend/.vite/
.vite/
dist-ssr/

# Package files
package-lock.json
**/package-lock.json

CHANGELOG.md (1.5 KiB)

Changelog

All notable changes to GPU Sentinel Pro will be documented in this file.

The format is based on Keep a Changelog, and this project adheres to Semantic Versioning.

Unreleased

Added

  • Initial project structure
  • Basic GPU metrics monitoring
  • Real-time dashboard with React frontend
  • FastAPI backend service
  • Supabase integration for data storage
  • Temperature and performance monitoring
  • Dark/light mode support
  • Multi-GPU support
  • Basic alerting system

Changed

  • None

Deprecated

  • None

Removed

  • None

Fixed

  • None

Security

  • Basic CORS configuration
  • Input validation
  • SQL injection protection

1.0.0 - 2024-02-20

Added

  • Initial release
  • Core monitoring functionality
  • Basic dashboard
  • Real-time updates
  • Database integration

Types of Changes

  • Added for new features
  • Changed for changes in existing functionality
  • Deprecated for soon-to-be removed features
  • Removed for now removed features
  • Fixed for any bug fixes
  • Security in case of vulnerabilities

Versioning

  • Major version (X.0.0) - Incompatible API changes
  • Minor version (0.X.0) - Added functionality in a backward compatible manner
  • Patch version (0.0.X) - Backward compatible bug fixes

CONTRIBUTING.md (3.7 KiB)

Contributing to GPU Sentinel Pro

Thank you for your interest in contributing to GPU Sentinel Pro! This document provides guidelines and workflows for contributing.

Code of Conduct

  • Be respectful and inclusive
  • Provide constructive feedback
  • Focus on the problem, not the person
  • Help others learn and grow

Getting Started

  1. Fork the repository
  2. Clone your fork:
     git clone https://github.com/YOUR_USERNAME/gpu-sentinel-pro.git
  3. Add upstream remote:
     git remote add upstream https://github.com/jackccrawford/gpu-sentinel-pro.git
  4. Create a feature branch:
     git checkout -b feature/your-feature-name

Development Setup

Backend Development

cd backend
python -m venv venv
source venv/bin/activate
pip install -r requirements.txt

Frontend Development

cd frontend
npm install

Database Setup

cd supabase
docker-compose up -d

Development Workflow

  1. Check TODO.md for planned features
  2. Create an issue for new features/bugs
  3. Write code and tests
  4. Update documentation
  5. Submit pull request

Commit Messages

Follow conventional commits:
  • feat: New features
  • fix: Bug fixes
  • docs: Documentation changes
  • style: Code style changes
  • refactor: Code refactoring
  • test: Test updates
  • chore: Maintenance tasks

Example:

git commit -m "feat: add temperature trend analysis"

Pull Request Process

  1. Update documentation
  2. Add/update tests
  3. Ensure all tests pass
  4. Update CHANGELOG.md
  5. Request review

Code Style

Python (Backend)

  • Follow PEP 8
  • Use type hints
  • Document functions and classes
  • Maximum line length: 100 characters

Example:

from typing import List

def calculate_temperature_trend(
    temperatures: List[float],
    window_size: int = 10
) -> float:
    """
    Calculate temperature trend over time window.

    Args:
        temperatures: List of temperature readings
        window_size: Size of rolling window

    Returns:
        float: Temperature change rate
    """
    # Implementation

TypeScript (Frontend)

  • Use ESLint configuration
  • Document components and functions
  • Use functional components
  • Type all props and state

Example:

interface TemperatureGraphProps {
  data: Temperature[];
  timeRange: TimeRange;
  onRangeChange: (range: TimeRange) => void;
}

const TemperatureGraph: React.FC<TemperatureGraphProps> = ({
  data,
  timeRange,
  onRangeChange,
}) => {
  // Implementation
};

Testing

Backend Tests

cd backend
pytest

Frontend Tests

cd frontend
npm test

Documentation

  • Update API.md for endpoint changes
  • Update INSTALLATION.md for setup changes
  • Add JSDoc comments for frontend components
  • Add docstrings for Python functions

Feature Requests

  1. Check existing issues and TODO.md
  2. Create a detailed issue with:
     • Use case
     • Expected behavior
     • Technical approach
     • Acceptance criteria

Bug Reports

Include:
  1. Description
  2. Steps to reproduce
  3. Expected behavior
  4. Actual behavior
  5. System information:
     • OS version
     • GPU model
     • Driver version
     • Software version

Review Process

  1. Code review by maintainers
  2. CI/CD checks
  3. Documentation review
  4. Testing verification
  5. Final approval

Release Process

  1. Version bump
  2. Update CHANGELOG.md
  3. Create release branch
  4. Run test suite
  5. Create GitHub release
  6. Deploy to production

Getting Help

  • Check documentation
  • Search existing issues
  • Join discussions
  • Ask questions in issues

Recognition

Contributors will be:
  • Listed in CONTRIBUTORS.md
  • Mentioned in release notes
  • Credited in documentation

Thank you for contributing to GPU Sentinel Pro!

Dockerfile (396 B)

FROM python:3.10-slim

WORKDIR /app

# Copy requirements first for better caching
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY backend/ backend/
COPY src/ src/

# Set environment variables
ENV PYTHONPATH=/app

# Run both collector and API server
CMD ["sh", "-c", "python -m src.collector.collector & python -m backend.src.service.app"]

LICENSE (1.0 KiB)

MIT License

Copyright (c) 2025 Jack Crawford

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md (5.0 KiB)

GPU Sentinel Pro

"Information should not be displayed all at once; let people gradually become familiar with it." - Edward Tufte

Transform GPU monitoring from complex metrics into intuitive visual patterns. Enterprise-grade NVIDIA GPU monitoring with real-time analytics, intelligent alerts, and historical analysis.

[Badges: CodeQL · X Follow · MIT License · Built with Codeium · Python · FastAPI · React · TypeScript · Supabase]

[Screenshot: Dark Mode Dashboard] Real-time GPU metrics visualized for instant comprehension

Quick Start

Prerequisites

  • NVIDIA GPU with compute capability 3.0 or higher
  • NVIDIA Driver 450.80.02 or higher
  • Python 3.8+ and Node.js 16.0+
  • 4GB RAM (8GB recommended)
  • 1GB free disk space

Installation

  1. Clone the repository:

     git clone git@github.com:jackccrawford/gpu-sentinel-pro.git
     cd gpu-sentinel-pro

  2. Set up the backend:

     cd backend
     python -m venv venv
     source venv/bin/activate  # On Windows: .\venv\Scripts\activate
     pip install -r requirements.txt

  3. Set up the frontend:

     cd frontend
     npm install

  4. Start the services:

     # Terminal 1 - Backend
     cd backend
     python src/service/app.py

     # Terminal 2 - Frontend
     cd frontend
     npm run dev

  5. Access the dashboard at http://localhost:5173

Pro Features

🎯 Enterprise-Grade Monitoring

  • Real-time Visual Dashboard
    • Modern React components with Material UI
    • Responsive design for desktop and mobile
    • Dark/light mode with automatic system preference detection
    • Multi-GPU support with individual monitoring panels

  • Advanced Metrics
    • Temperature and utilization with color-coded ranges
    • Memory usage and bandwidth monitoring
    • Power consumption and efficiency tracking
    • Process-level GPU utilization
    • Custom metric aggregation

🔔 Intelligent Alert System

  • Configurable Thresholds

    {
      "temperature": { "warning": 75, "critical": 85 },
      "memory": { "warning": 85, "critical": 95 }
    }

  • Alert Types
    • Temperature spikes
    • Memory leaks
    • Process crashes
    • Power anomalies
    • Custom conditions

📊 Analytics & Reporting

  • Historical Data
    • Time-series metrics storage
    • Customizable retention policies
    • Data export in multiple formats
    • Trend analysis and forecasting

  • Performance Insights
    • Workload pattern recognition
    • Resource utilization heatmaps
    • Efficiency recommendations
    • Cost analysis tools

🛠 Enterprise Integration

  • API Access
    • RESTful API with OpenAPI documentation
    • Secure authentication
    • Rate limiting and quotas
    • Webhook support

  • Security
    • Role-based access control
    • Audit logging
    • SSL/TLS encryption
    • Regular security updates

Configuration

Backend Settings

# config.py
SETTINGS = {
    'update_interval': 1000,  # ms
    'retention_period': '30d',
    'log_level': 'INFO',
    'enable_analytics': True,
    'alert_cooldown': 300,  # seconds
}

Frontend Configuration

// config.ts
export const CONFIG = {
  API_URL: 'http://localhost:8000',
  REFRESH_RATE: 1000,
  THEME_MODE: 'system',  // 'light' | 'dark' | 'system'
  CHART_HISTORY: 300,    // data points
};

System Architecture

graph TD
    A[Frontend React App] -->|HTTP/WebSocket| B[FastAPI Backend]
    B -->|NVML| C[GPU Hardware]
    B -->|Time Series| D[Supabase]
    B -->|Alerts| E[Notification Service]

Contributing

  1. Fork the repository
  2. Create a feature branch: git checkout -b feature/amazing-feature
  3. Commit your changes: git commit -m 'feat: add amazing feature'
  4. Push to the branch: git push origin feature/amazing-feature
  5. Open a pull request

Support

License

This project is licensed under the MIT License - see the LICENSE file for details.


SECURITY.md (2.6 KiB)

Security Policy

Supported Versions

Version   Supported
1.0.x     :white_check_mark:

Reporting a Vulnerability

We take security seriously at GPU Sentinel Pro. If you discover a security vulnerability, please follow these steps:

  1. Do Not create a public GitHub issue
  2. Send details to [security@example.com] (to be replaced with actual security contact)
  3. Include:
     • Description of the vulnerability
     • Steps to reproduce
     • Potential impact
     • Suggested fix (if any)

Response Timeline

  • Initial response: Within 48 hours
  • Status update: Within 5 business days
  • Fix timeline: Based on severity
    • Critical: 7 days
    • High: 14 days
    • Medium: 30 days
    • Low: Next release

Security Best Practices

Production Deployment

  1. Authentication
     • Use secure authentication methods
     • Implement rate limiting
     • Enable MFA where applicable

  2. Network Security
     • Use HTTPS/TLS
     • Configure proper CORS settings
     • Implement firewall rules

  3. Database Security
     • Use strong passwords
     • Regular backups
     • Encryption at rest
     • Limited network access

  4. API Security
     • Input validation
     • Output sanitization
     • Token-based authentication
     • Rate limiting

Development Security

  1. Code Security
     • Regular dependency updates
     • Code scanning enabled
     • No secrets in code
     • Type checking enabled

  2. Access Control
     • Principle of least privilege
     • Regular access review
     • Secure credential storage

  3. Data Protection
     • Sensitive data encryption
     • Secure data transmission
     • Regular data cleanup

Security Features

Current Implementation

  • Input validation
  • SQL injection protection
  • XSS protection
  • CORS configuration
  • Rate limiting

Planned Features

  • [ ] API authentication
  • [ ] User role management
  • [ ] Audit logging
  • [ ] Enhanced encryption
  • [ ] Automated security scanning

Vulnerability Disclosure

We follow a responsible disclosure process:

  1. Reporter submits vulnerability
  2. Acknowledgment sent
  3. Investigation conducted
  4. Fix developed and tested
  5. Fix deployed
  6. Reporter notified
  7. Public disclosure (if appropriate)

Security Compliance

  • Follow OWASP guidelines
  • Regular security audits
  • Dependency vulnerability scanning
  • Code security analysis

Contact

Security issues: [security@example.com]
General issues: GitHub Issues

Recognition

We maintain a security hall of fame for responsible disclosure of vulnerabilities.

Updates

This security policy is reviewed and updated quarterly.

Last updated: February 2024

TODO.md (2.8 KiB)

GPU Sentinel Pro - Development Roadmap

Core System Reliability

  • [x] System Health Check & Diagnostics
  • [x] Graceful handling of missing NVIDIA drivers
  • [x] System requirements verification
  • [x] Driver version compatibility check
  • [x] Service connectivity status dashboard
  • [x] Auto-recovery procedures
  • [x] Installation troubleshooting guide

Data Management

  • [x] Logging Control Features
  • [x] Pause/Resume Supabase logging
  • [x] Data retention policy configuration
  • [x] Manual data export functionality
  • [x] Historical data cleanup tools
  • [x] Backup and restore capabilities

Alert System

  • [x] Alert Management Interface
  • [x] Alert history viewer
  • [x] Alert configuration dashboard
  • [x] Custom alert rules builder
  • [x] Notification preferences
  • [x] Alert acknowledgment workflow
  • [x] Alert severity configuration
  • [x] Email/webhook integration

Analytics & Insights

  • [x] Performance Analysis Tools
  • [x] Historical performance graphing
  • [x] Temperature trend analysis
  • [x] Power efficiency metrics
  • [x] Usage pattern recognition
  • [x] Performance anomaly detection
  • [x] Resource utilization heatmaps
  • [x] Cost analysis (power consumption)

Advanced Features

  • [ ] Workload Management
  • [ ] GPU task scheduling
  • [ ] Resource allocation optimization
  • [ ] Multi-user access control
  • [ ] Custom dashboard layouts
  • [ ] Performance benchmarking
  • [ ] Predictive maintenance
  • [ ] Container orchestration integration

Integration & Extensions

  • [ ] External System Integration
  • [ ] Kubernetes integration
  • [ ] Docker container stats
  • [ ] CI/CD pipeline monitoring
  • [ ] Cloud service provider metrics
  • [ ] External monitoring systems

Documentation

  • [ ] User Guides
  • [ ] Installation guides for different platforms
  • [ ] Configuration documentation
  • [ ] API documentation
  • [ ] Troubleshooting guides
  • [ ] Best practices

Development Infrastructure

  • [ ] Development Tools
  • [ ] Automated testing suite
  • [ ] CI/CD pipeline
  • [ ] Code quality checks
  • [ ] Performance testing framework
  • [ ] Development environment setup scripts

Priority Queue (Next Up)

  1. System Health Check implementation
  2. Logging Control Features
  3. Basic Alert Management
  4. Historical Data Analysis
  5. Advanced Features

Notes

  • Features will be implemented based on community feedback and real-world usage patterns
  • Security considerations will be integrated into each feature
  • Performance impact will be evaluated for each new feature
  • Backward compatibility will be maintained where possible

Contributing

See CONTRIBUTING.md for guidelines on how to contribute to these features.

Feature Requests

Please use the GitHub Issues page to submit new feature requests or vote on existing ones.

backend/README.md (523 B)

GPU Metrics Service Backend

A FastAPI service that monitors NVIDIA GPUs and provides metrics via REST API.

Setup

  1. Install dependencies:

     pip install -r requirements.txt

  2. Start the service:

     python src/service/app.py

API Endpoints

  • GET /api/gpu-stats - Current GPU metrics
  • GET /api/gpu-stats/history - Historical metrics
  • GET /api/alerts - Recent alerts
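
A quick smoke test of these endpoints from Python; port 5000 follows .env.template's API_URL (the root README's frontend config points at 8000, so adjust to your deployment):

# Smoke-test the REST endpoints; the port is an assumption from .env.template.
import json
import urllib.request

for path in ("/api/gpu-stats", "/api/alerts"):
    with urllib.request.urlopen(f"http://localhost:5000{path}", timeout=5) as resp:
        print(path, resp.status, json.loads(resp.read()))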

Configuration

Edit src/service/config.yaml to customize:
  • Alert thresholds
  • Polling intervals
  • Data retention

backend/migrations/001_create_gpu_metrics_table.sql (1.1 KiB)

-- Create an extension for generating UUIDs if not exists
create extension if not exists "uuid-ossp";

-- Create the gpu_metrics table
create table if not exists gpu_metrics (
    id uuid primary key default uuid_generate_v4(),
    timestamp timestamptz not null default now(),
    
    -- GPU Burn Metrics
    duration integer not null,
    errors integer not null,
    running boolean not null,
    
    -- Nvidia Info
    cuda_version text not null,
    driver_version text not null,
    
    -- GPU Metrics Array (stored as JSONB)
    gpus jsonb not null,
    
    -- Additional fields
    processes jsonb default '[]'::jsonb,
    success boolean not null,
    
    -- Indexes for common queries
    created_at timestamptz not null default now()
);

-- Create indexes for better query performance
create index if not exists idx_gpu_metrics_timestamp on gpu_metrics(timestamp);
create index if not exists idx_gpu_metrics_created_at on gpu_metrics(created_at);

-- Add a comment to the table
comment on table gpu_metrics is 'Stores GPU metrics data collected from NVIDIA GPUs';
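
For orientation, a hedged sketch of inserting one row into this table with psycopg2; the connection parameters are placeholders, and the service's real insert path is backend/src/database/client.py, rendered below.

# Illustrative insert into gpu_metrics; credentials and port are placeholders.
import json
import psycopg2

conn = psycopg2.connect(dbname="postgres", user="postgres",
                        password="postgres", host="localhost", port=5432)
with conn, conn.cursor() as cur:
    cur.execute(
        """
        INSERT INTO gpu_metrics (duration, errors, running, cuda_version,
                                 driver_version, gpus, success)
        VALUES (%s, %s, %s, %s, %s, %s::jsonb, %s)
        RETURNING id
        """,
        (0, 0, False, "12.2", "535.183.01",
         json.dumps([{"index": 0, "temperature": 45}]), True),
    )
    print("inserted id:", cur.fetchone()[0])
conn.close()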

backend/migrations/002_create_alerts_table.sql (3.5 KiB)

-- Create alerts table
CREATE TABLE IF NOT EXISTS alert_thresholds (
    id uuid PRIMARY KEY DEFAULT uuid_generate_v4(),
    metric_name TEXT NOT NULL,
    warning_threshold FLOAT NOT NULL,
    critical_threshold FLOAT NOT NULL,
    duration_seconds INTEGER NOT NULL DEFAULT 60,
    enabled BOOLEAN NOT NULL DEFAULT true,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Create alerts history table
CREATE TABLE IF NOT EXISTS alert_history (
    id uuid PRIMARY KEY DEFAULT uuid_generate_v4(),
    alert_threshold_id uuid REFERENCES alert_thresholds(id),
    gpu_index INTEGER NOT NULL,
    metric_value FLOAT NOT NULL,
    threshold_value FLOAT NOT NULL,
    severity TEXT NOT NULL,  -- 'warning' or 'critical'
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Create index for querying recent alerts
CREATE INDEX IF NOT EXISTS idx_alert_history_created_at 
ON alert_history(created_at);

-- Create function to cleanup old alert history
CREATE OR REPLACE FUNCTION cleanup_old_alerts(days_to_keep INTEGER)
RETURNS void AS $$
BEGIN
    DELETE FROM alert_history 
    WHERE created_at < NOW() - (days_to_keep || ' days')::INTERVAL;
END;
$$ LANGUAGE plpgsql;

-- Create function to update alert thresholds from config
CREATE OR REPLACE FUNCTION update_alert_thresholds_from_config(
    config jsonb
) RETURNS void AS $$
BEGIN
    -- Temperature alerts
    INSERT INTO alert_thresholds (
        metric_name, 
        warning_threshold, 
        critical_threshold, 
        duration_seconds
    ) VALUES (
        'temperature',
        (config->'alerts'->>'temperature_warning')::float,
        (config->'alerts'->>'temperature_critical')::float,
        (config->'alerts'->>'temperature_duration')::integer
    ) ON CONFLICT (metric_name) DO UPDATE SET
        warning_threshold = EXCLUDED.warning_threshold,
        critical_threshold = EXCLUDED.critical_threshold,
        duration_seconds = EXCLUDED.duration_seconds,
        updated_at = NOW();

    -- GPU utilization alerts
    INSERT INTO alert_thresholds (
        metric_name, 
        warning_threshold, 
        critical_threshold, 
        duration_seconds
    ) VALUES (
        'gpu_utilization',
        (config->'alerts'->>'gpu_utilization_warning')::float,
        (config->'alerts'->>'gpu_utilization_critical')::float,
        (config->'alerts'->>'gpu_utilization_duration')::integer
    ) ON CONFLICT (metric_name) DO UPDATE SET
        warning_threshold = EXCLUDED.warning_threshold,
        critical_threshold = EXCLUDED.critical_threshold,
        duration_seconds = EXCLUDED.duration_seconds,
        updated_at = NOW();

    -- Memory usage alerts
    INSERT INTO alert_thresholds (
        metric_name, 
        warning_threshold, 
        critical_threshold, 
        duration_seconds
    ) VALUES (
        'memory_usage',
        (config->'alerts'->>'memory_usage_warning')::float,
        (config->'alerts'->>'memory_usage_critical')::float,
        (config->'alerts'->>'memory_usage_duration')::integer
    ) ON CONFLICT (metric_name) DO UPDATE SET
        warning_threshold = EXCLUDED.warning_threshold,
        critical_threshold = EXCLUDED.critical_threshold,
        duration_seconds = EXCLUDED.duration_seconds,
        updated_at = NOW();
END;
$$ LANGUAGE plpgsql;

-- Add unique constraint for metric_name
ALTER TABLE alert_thresholds 
ADD CONSTRAINT unique_metric_name UNIQUE (metric_name);

COMMENT ON TABLE alert_thresholds IS 'Stores configurable alert thresholds for GPU metrics';
COMMENT ON TABLE alert_history IS 'Stores history of triggered alerts';
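
The update_alert_thresholds_from_config function above can be exercised from Python. A minimal sketch, assuming placeholder connection details and a config document shaped the way the function reads it:

# Upsert thresholds by calling the SQL function defined in this migration.
import json
import psycopg2

config = {"alerts": {
    "temperature_warning": 75, "temperature_critical": 85, "temperature_duration": 60,
    "gpu_utilization_warning": 90, "gpu_utilization_critical": 98, "gpu_utilization_duration": 60,
    "memory_usage_warning": 85, "memory_usage_critical": 95, "memory_usage_duration": 60,
}}

conn = psycopg2.connect(dbname="postgres", user="postgres",
                        password="postgres", host="localhost", port=5432)
with conn, conn.cursor() as cur:
    cur.execute("SELECT update_alert_thresholds_from_config(%s::jsonb)",
                (json.dumps(config),))
conn.close()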

backend/requirements.txt (280 B)

fastapi==0.109.2
uvicorn==0.27.1
psycopg2-binary==2.9.9
pyyaml==6.0.1
python-dotenv==1.0.1
pynvml==11.5.0
pandas==2.2.0
numpy==1.26.3
sqlalchemy==1.4.51
alembic==1.13.1
python-jose==3.3.0
fastapi-utils==0.2.1
prometheus-client==0.19.0
aiohttp==3.9.3
asyncpg==0.29.0
psutil==5.9.8

backend/src/__init__.py (0 B)


backend/src/database/__init__.py (0 B)


backend/src/database/client.py (2.8 KiB)

import psycopg2
from psycopg2.extras import Json
from datetime import datetime
from ..models.gpu_metrics import GpuMetricsRecord

class DatabaseClient:
    def __init__(self):
        self.conn_params = {
            'dbname': 'postgres',
            'user': 'postgres',
            'password': 'postgres',
            'host': 'localhost',
            'port': 54432
        }

    def get_connection(self):
        return psycopg2.connect(**self.conn_params)

    def insert_gpu_metrics(self, metrics: GpuMetricsRecord) -> dict:
        """
        Insert GPU metrics into PostgreSQL
        Returns the inserted record
        """
        if not metrics.timestamp:
            metrics.timestamp = datetime.utcnow().isoformat()

        data = metrics.model_dump()
        
        with self.get_connection() as conn:
            with conn.cursor() as cur:
                cur.execute("""
                    INSERT INTO gpu_metrics (
                        timestamp,
                        duration,
                        errors,
                        running,
                        cuda_version,
                        driver_version,
                        gpus,
                        processes,
                        success,
                        created_at
                    ) VALUES (
                        %s, %s, %s, %s, %s, %s, %s, %s, %s, NOW()
                    ) RETURNING id
                """, (
                    data['timestamp'],
                    data['gpu_burn_metrics']['duration'],
                    data['gpu_burn_metrics']['errors'],
                    data['gpu_burn_metrics']['running'],
                    data['nvidia_info']['cuda_version'],
                    data['nvidia_info']['driver_version'],
                    Json(data['gpus']),
                    Json(data['processes']),
                    data['success']
                ))
                record_id = cur.fetchone()[0]
                return {"id": record_id}

    def get_metrics_in_timerange(self, start_time: str, end_time: str):
        """
        Retrieve metrics within a specific time range
        """
        with self.get_connection() as conn:
            with conn.cursor() as cur:
                cur.execute("""
                    SELECT *
                    FROM gpu_metrics
                    WHERE timestamp >= %s AND timestamp <= %s
                    ORDER BY timestamp DESC
                """, (start_time, end_time))
                
                columns = [desc[0] for desc in cur.description]
                results = []
                
                for row in cur.fetchall():
                    result = dict(zip(columns, row))
                    results.append(result)
                
                return results

# Create a singleton instance
db = DatabaseClient()
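
A usage sketch for the time-range query above, assuming the src.database.client import path used elsewhere in the service; the ISO 8601 timestamps match what insert_gpu_metrics writes:

# Fetch the last hour of metrics via the singleton client.
from datetime import datetime, timedelta, timezone

from src.database.client import db

end = datetime.now(timezone.utc)
start = end - timedelta(hours=1)
rows = db.get_metrics_in_timerange(start.isoformat(), end.isoformat())
print(f"{len(rows)} samples in the last hour")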

backend/src/database/config.py (267 B)

import os
from dotenv import load_dotenv

load_dotenv()

SUPABASE_URL = os.getenv('SUPABASE_URL')
SUPABASE_KEY = os.getenv('SUPABASE_KEY')

if not SUPABASE_URL or not SUPABASE_KEY:
    raise ValueError("Missing required Supabase credentials in environment variables")

backend/src/database/test_connection.py (1.2 KiB)

import os
from dotenv import load_dotenv
from supabase import create_client, Client
from datetime import datetime

# Load environment variables
load_dotenv()

# Get Supabase credentials
SUPABASE_URL = os.getenv('SUPABASE_URL')
SUPABASE_KEY = os.getenv('SUPABASE_KEY')

def test_supabase_connection():
    try:
        if not SUPABASE_URL or not SUPABASE_KEY:
            print("Error: Missing Supabase credentials in .env file")
            return False

        # Initialize Supabase client
        supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
        
        # Try a simple query to test connection
        test_data = {
            "timestamp": datetime.utcnow().isoformat(),
            "test": True,
            "message": "Connection test"
        }
        
        print("Attempting to connect to Supabase...")
        result = supabase.table('gpu_metrics').insert(test_data).execute()
        
        print("Successfully connected to Supabase!")
        print("\nTest record inserted:")
        print(result)
        return True

    except Exception as e:
        print(f"Error connecting to Supabase: {e}")
        return False

if __name__ == "__main__":
    test_supabase_connection()

backend/src/models/__init__.py (0 B)


backend/src/models/gpu_metrics.py (786 B)

from typing import List, Optional
from pydantic import BaseModel


class GpuBurnMetrics(BaseModel):
    duration: int
    errors: int
    running: bool


class NvidiaInfo(BaseModel):
    cuda_version: str
    driver_version: str


class GpuMetrics(BaseModel):
    compute_mode: str
    fan_speed: int
    gpu_utilization: int
    index: int
    memory_total: int
    memory_used: int
    name: str
    peak_temperature: int
    power_draw: float
    power_limit: int
    temp_change_rate: int
    temperature: int


class GpuMetricsRecord(BaseModel):
    gpu_burn_metrics: GpuBurnMetrics
    gpus: List[GpuMetrics]
    nvidia_info: NvidiaInfo
    processes: List[dict] = []
    success: bool
    timestamp: Optional[str] = None  # We'll add this for tracking when the record was created
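
Since client.py serializes these models with Pydantic v2 calls (model_dump), a hand-built record illustrates the JSON shape the service expects; every field value here is invented for the example:

# Construct and validate a GpuMetricsRecord to show the expected payload shape.
from src.models.gpu_metrics import GpuMetricsRecord

record = GpuMetricsRecord.model_validate({
    "gpu_burn_metrics": {"duration": 0, "errors": 0, "running": False},
    "nvidia_info": {"cuda_version": "12.2", "driver_version": "535.183.01"},
    "gpus": [{
        "compute_mode": "Default", "fan_speed": 30, "gpu_utilization": 12,
        "index": 0, "memory_total": 24576, "memory_used": 2048,
        "name": "NVIDIA GeForce RTX 3090", "peak_temperature": 62,
        "power_draw": 98.5, "power_limit": 350, "temp_change_rate": 0,
        "temperature": 45,
    }],
    "success": True,
})
print(record.model_dump_json(indent=2))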

backend/src/service/__init__.py (0 B)


backend/src/service/alert_manager.py (6.2 KiB)

from datetime import datetime, timedelta
import logging
from .config import config
from src.database.client import db

logger = logging.getLogger(__name__)

class AlertManager:
    def __init__(self):
        self.alert_cache = {}  # {metric_name_gpu_index: last_alert_time}
        self.load_config()

    def load_config(self):
        """Load alert configuration"""
        self.thresholds = config.get('alerts')
        logger.info("Alert thresholds loaded from config")

    def check_metrics(self, gpu_metrics):
        """Check GPU metrics against thresholds"""
        alerts = []
        current_time = datetime.utcnow()

        for gpu in gpu_metrics.gpus:
            # Temperature checks
            alerts.extend(self._check_metric(
                metric_name='temperature',
                metric_value=gpu.temperature,
                gpu_index=gpu.index,
                warning=self.thresholds['temperature']['warning'],
                critical=self.thresholds['temperature']['critical'],
                duration=self.thresholds['temperature']['duration'],
                current_time=current_time
            ))

            # GPU utilization checks
            alerts.extend(self._check_metric(
                metric_name='gpu_utilization',
                metric_value=gpu.gpu_utilization,
                gpu_index=gpu.index,
                warning=self.thresholds['gpu_utilization']['warning'],
                critical=self.thresholds['gpu_utilization']['critical'],
                duration=self.thresholds['gpu_utilization']['duration'],
                current_time=current_time
            ))

            # Memory usage checks
            memory_usage_percent = (gpu.memory_used / gpu.memory_total) * 100
            alerts.extend(self._check_metric(
                metric_name='memory_usage',
                metric_value=memory_usage_percent,
                gpu_index=gpu.index,
                warning=self.thresholds['memory_usage']['warning'],
                critical=self.thresholds['memory_usage']['critical'],
                duration=self.thresholds['memory_usage']['duration'],
                current_time=current_time
            ))

            # Power usage checks
            power_usage_percent = (gpu.power_draw / gpu.power_limit) * 100
            alerts.extend(self._check_metric(
                metric_name='power_draw',
                metric_value=power_usage_percent,
                gpu_index=gpu.index,
                warning=self.thresholds['power_draw']['warning'],
                critical=self.thresholds['power_draw']['critical'],
                duration=self.thresholds['power_draw']['duration'],
                current_time=current_time
            ))

        if alerts:
            self._store_alerts(alerts)
            logger.warning(f"Generated {len(alerts)} alerts")

        return alerts

    def _check_metric(self, metric_name, metric_value, gpu_index, warning, critical, 
                     duration, current_time):
        """Check a single metric against its thresholds"""
        alerts = []
        cache_key = f"{metric_name}_gpu{gpu_index}"
        
        # Check if we should alert based on duration
        last_alert = self.alert_cache.get(cache_key)
        should_alert = (
            last_alert is None or 
            (current_time - last_alert).total_seconds() >= duration
        )

        if not should_alert:
            return alerts

        if metric_value >= critical:
            alerts.append({
                'metric_name': metric_name,
                'gpu_index': gpu_index,
                'value': metric_value,
                'threshold': critical,
                'severity': 'critical',
                'timestamp': current_time
            })
            self.alert_cache[cache_key] = current_time
        elif metric_value >= warning:
            alerts.append({
                'metric_name': metric_name,
                'gpu_index': gpu_index,
                'value': metric_value,
                'threshold': warning,
                'severity': 'warning',
                'timestamp': current_time
            })
            self.alert_cache[cache_key] = current_time

        return alerts

    def _store_alerts(self, alerts):
        """Store alerts in the database"""
        try:
            with db.get_connection() as conn:
                with conn.cursor() as cur:
                    for alert in alerts:
                        cur.execute("""
                            INSERT INTO alert_history (
                                gpu_index, metric_value, threshold_value, 
                                severity, created_at
                            ) VALUES (%s, %s, %s, %s, %s)
                        """, (
                            alert['gpu_index'],
                            alert['value'],
                            alert['threshold'],
                            alert['severity'],
                            alert['timestamp']
                        ))
        except Exception as e:
            logger.error(f"Failed to store alerts: {e}")

    def cleanup_old_alerts(self):
        """Clean up old alerts based on retention config"""
        try:
            retention_days = config.get('retention', 'days_to_keep')
            with db.get_connection() as conn:
                with conn.cursor() as cur:
                    cur.execute("SELECT cleanup_old_alerts(%s)", (retention_days,))
            logger.info(f"Cleaned up alerts older than {retention_days} days")
        except Exception as e:
            logger.error(f"Failed to cleanup old alerts: {e}")

    def get_recent_alerts(self, hours=24):
        """Get recent alerts"""
        try:
            with db.get_connection() as conn:
                with conn.cursor() as cur:
                    cur.execute("""
                        SELECT * FROM alert_history
                        WHERE created_at > NOW() - interval '%s hours'
                        ORDER BY created_at DESC
                    """, (hours,))
                    columns = [desc[0] for desc in cur.description]
                    return [dict(zip(columns, row)) for row in cur.fetchall()]
        except Exception as e:
            logger.error(f"Failed to get recent alerts: {e}")
            return []

# Create singleton instance
alert_manager = AlertManager()

backend/src/service/alerts.py (6.6 KiB)

from datetime import datetime, timedelta
import logging
from typing import List, Dict, Any
from src.service.settings import settings
from src.database.client import db
from src.models.gpu_metrics import GpuMetricsRecord

logger = logging.getLogger(__name__)

class AlertLevel:
    CRITICAL = "critical"
    WARNING = "warning"
    CAUTION = "caution"
    GOOD = "good"
    IDEAL = "ideal"

class AlertSystem:
    def __init__(self):
        # Cache structure: {f"{gpu_index}:{metric}:{severity}": timestamp}
        self.alert_cache = {}
        # Minimum time between similar alerts (5 minutes)
        self.alert_cooldown = timedelta(minutes=5)

    def should_trigger_alert(self, gpu_index: int, metric: str, 
                           severity: str, value: float) -> bool:
        """Determine if an alert should be triggered based on cache and cooldown"""
        cache_key = f"{gpu_index}:{metric}:{severity}"
        current_time = datetime.utcnow()

        # If no previous alert, always trigger
        if cache_key not in self.alert_cache:
            self.alert_cache[cache_key] = current_time
            return True

        # Check if enough time has passed since last alert
        last_alert_time = self.alert_cache[cache_key]
        if current_time - last_alert_time >= self.alert_cooldown:
            self.alert_cache[cache_key] = current_time
            return True

        return False

    def get_metric_level(self, metric: str, value: float) -> str:
        """Determine alert level for any metric based on thresholds"""
        thresholds = settings.get('alerts', metric)
        if value >= thresholds['critical']:
            return AlertLevel.CRITICAL
        elif value >= thresholds['warning']:
            return AlertLevel.WARNING
        elif value >= thresholds['caution']:
            return AlertLevel.CAUTION
        elif value >= thresholds['good']:
            return AlertLevel.GOOD
        return AlertLevel.IDEAL

    def check_metrics(self, metrics: GpuMetricsRecord) -> List[Dict[str, Any]]:
        """Check GPU metrics against all threshold levels"""
        alerts = []
        current_time = datetime.utcnow()

        for gpu in metrics.gpus:
            # Temperature check
            temp_level = self.get_metric_level('temperature', gpu.temperature)
            if temp_level in [AlertLevel.CRITICAL, AlertLevel.WARNING]:
                if self.should_trigger_alert(gpu.index, 'temperature', temp_level, gpu.temperature):
                    alerts.append(self._create_alert(
                        'temperature', gpu.index, gpu.temperature,
                        settings.get('alerts', 'temperature', temp_level),
                        temp_level, current_time
                    ))

            # GPU utilization check
            util_level = self.get_metric_level('gpu_utilization', gpu.gpu_utilization)
            if util_level in [AlertLevel.CRITICAL, AlertLevel.WARNING]:
                if self.should_trigger_alert(gpu.index, 'gpu_utilization', util_level, gpu.gpu_utilization):
                    alerts.append(self._create_alert(
                        'gpu_utilization', gpu.index, gpu.gpu_utilization,
                        settings.get('alerts', 'gpu_utilization', util_level),
                        util_level, current_time
                    ))

            # Fan speed check
            fan_level = self.get_metric_level('fan_speed', gpu.fan_speed)
            if fan_level in [AlertLevel.CRITICAL, AlertLevel.WARNING]:
                if self.should_trigger_alert(gpu.index, 'fan_speed', fan_level, gpu.fan_speed):
                    alerts.append(self._create_alert(
                        'fan_speed', gpu.index, gpu.fan_speed,
                        settings.get('alerts', 'fan_speed', fan_level),
                        fan_level, current_time
                    ))

            # Memory usage check
            memory_percent = (gpu.memory_used / gpu.memory_total) * 100
            mem_level = self.get_metric_level('memory_usage', memory_percent)
            if mem_level in [AlertLevel.CRITICAL, AlertLevel.WARNING]:
                if self.should_trigger_alert(gpu.index, 'memory_usage', mem_level, memory_percent):
                    alerts.append(self._create_alert(
                        'memory_usage', gpu.index, memory_percent,
                        settings.get('alerts', 'memory_usage', mem_level),
                        mem_level, current_time
                    ))

        if alerts:
            self._store_alerts(alerts)
            logger.warning(f"Generated {len(alerts)} alerts")

        return alerts

    def _create_alert(self, metric: str, gpu_index: int, value: float, 
                     threshold: float, severity: str, timestamp: datetime) -> Dict[str, Any]:
        """Create alert dictionary"""
        return {
            'metric': metric,
            'gpu_index': gpu_index,
            'value': value,
            'threshold': threshold,
            'severity': severity,
            'timestamp': timestamp
        }

    def _store_alerts(self, alerts: List[Dict[str, Any]]):
        """Store alerts in database"""
        try:
            with db.get_connection() as conn:
                with conn.cursor() as cur:
                    for alert in alerts:
                        cur.execute("""
                            INSERT INTO alert_history (
                                gpu_index, metric_value, threshold_value, 
                                severity, created_at
                            ) VALUES (%s, %s, %s, %s, %s)
                        """, (
                            alert['gpu_index'],
                            alert['value'],
                            alert['threshold'],
                            alert['severity'],
                            alert['timestamp']
                        ))
        except Exception as e:
            logger.error(f"Failed to store alerts: {e}")

    def get_recent_alerts(self, hours: int = 24) -> List[Dict[str, Any]]:
        """Get recent alerts from database"""
        try:
            with db.get_connection() as conn:
                with conn.cursor() as cur:
                    cur.execute("""
                        SELECT * FROM alert_history
                        WHERE created_at > NOW() - interval '%s hours'
                        ORDER BY created_at DESC
                    """, (hours,))
                    columns = [desc[0] for desc in cur.description]
                    return [dict(zip(columns, row)) for row in cur.fetchall()]
        except Exception as e:
            logger.error(f"Failed to get recent alerts: {e}")
            return []

# Create singleton instance
alert_system = AlertSystem()
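
Both AlertManager and AlertSystem rely on the same cooldown idea: remember the last alert time per (gpu, metric, severity) key and stay silent until the cooldown lapses. A self-contained sketch of just that pattern:

# Cooldown cache pattern: one timestamp per key, re-alert only after cooldown.
from datetime import datetime, timedelta, timezone

cache: dict = {}
COOLDOWN = timedelta(minutes=5)

def should_alert(key: str, now: datetime) -> bool:
    last = cache.get(key)
    if last is None or now - last >= COOLDOWN:
        cache[key] = now
        return True
    return False

t0 = datetime.now(timezone.utc)
print(should_alert("0:temperature:critical", t0))                         # True
print(should_alert("0:temperature:critical", t0 + timedelta(minutes=1)))  # False (cooling down)
print(should_alert("0:temperature:critical", t0 + timedelta(minutes=6)))  # True (cooldown elapsed)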

backend/src/service/analytics_service.py (8.2 KiB)

import logging
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple
import pandas as pd
import numpy as np
from scipy import stats
from .config import config
from src.database.client import db

logger = logging.getLogger(__name__)

class AnalyticsService:
    def __init__(self):
        self.anomaly_threshold = 2.0  # Standard deviations for anomaly detection

    def get_historical_metrics(self, start_time: datetime, 
                             end_time: datetime) -> pd.DataFrame:
        """Fetch historical GPU metrics within the specified time range."""
        try:
            with db.get_connection() as conn:
                query = """
                    SELECT 
                        timestamp,
                        jsonb_array_elements(gpus) as gpu_data
                    FROM gpu_metrics
                    WHERE timestamp BETWEEN %s AND %s
                    ORDER BY timestamp
                """
                df = pd.read_sql_query(query, conn, params=(start_time, end_time))
                
                # Parse GPU data from JSONB
                df['gpu_data'] = df['gpu_data'].apply(eval)
                metrics_df = pd.json_normalize(df['gpu_data'])
                
                # Combine with timestamp
                metrics_df['timestamp'] = df['timestamp']
                
                return metrics_df
        except Exception as e:
            logger.error(f"Error fetching historical metrics: {e}")
            return pd.DataFrame()

    def calculate_usage_patterns(self, days: int = 7) -> Dict:
        """Calculate GPU usage patterns over time."""
        end_time = datetime.now()
        start_time = end_time - timedelta(days=days)
        
        df = self.get_historical_metrics(start_time, end_time)
        if df.empty:
            return {}

        patterns = {
            'hourly_avg': self._calculate_hourly_averages(df),
            'daily_avg': self._calculate_daily_averages(df),
            'peak_usage_times': self._find_peak_usage_times(df),
            'utilization_distribution': self._calculate_utilization_distribution(df)
        }
        
        return patterns

    def detect_anomalies(self, hours: int = 24) -> List[Dict]:
        """Detect anomalies in GPU metrics."""
        end_time = datetime.now()
        start_time = end_time - timedelta(hours=hours)
        
        df = self.get_historical_metrics(start_time, end_time)
        if df.empty:
            return []

        anomalies = []
        
        # Check for anomalies in different metrics
        metrics = ['utilization', 'temperature', 'memory_used', 'power_draw']
        for metric in metrics:
            if metric in df.columns:
                anomalies.extend(
                    self._detect_metric_anomalies(df, metric)
                )
        
        return anomalies

    def analyze_performance_trends(self, days: int = 30) -> Dict:
        """Analyze long-term performance trends."""
        end_time = datetime.now()
        start_time = end_time - timedelta(days=days)
        
        df = self.get_historical_metrics(start_time, end_time)
        if df.empty:
            return {}

        trends = {
            'utilization_trend': self._calculate_trend(df, 'utilization'),
            'temperature_trend': self._calculate_trend(df, 'temperature'),
            'memory_trend': self._calculate_trend(df, 'memory_used'),
            'power_trend': self._calculate_trend(df, 'power_draw')
        }
        
        return trends

    def calculate_efficiency_metrics(self, days: int = 7) -> Dict:
        """Calculate GPU efficiency metrics."""
        end_time = datetime.now()
        start_time = end_time - timedelta(days=days)
        
        df = self.get_historical_metrics(start_time, end_time)
        if df.empty:
            return {}

        metrics = {}
        
        # Calculate power efficiency (GPU utilization per watt)
        if all(col in df.columns for col in ['power_draw', 'utilization']):
            metrics['power_efficiency'] = self._calculate_power_efficiency(df)
        
        # Calculate memory efficiency
        if all(col in df.columns for col in ['memory_used', 'memory_total']):
            metrics['memory_efficiency'] = self._calculate_memory_efficiency(df)
        
        return metrics

    def _calculate_hourly_averages(self, df: pd.DataFrame) -> Dict:
        """Calculate average metrics by hour of day."""
        df['hour'] = df['timestamp'].dt.hour
        hourly_avg = df.groupby('hour').agg({
            'utilization': 'mean',
            'temperature': 'mean',
            'memory_used': 'mean',
            'power_draw': 'mean'
        }).to_dict()
        
        return hourly_avg

    def _calculate_daily_averages(self, df: pd.DataFrame) -> Dict:
        """Calculate average metrics by day of week."""
        df['day'] = df['timestamp'].dt.dayofweek
        daily_avg = df.groupby('day').agg({
            'utilization': 'mean',
            'temperature': 'mean',
            'memory_used': 'mean',
            'power_draw': 'mean'
        }).to_dict()
        
        return daily_avg

    def _find_peak_usage_times(self, df: pd.DataFrame) -> List[Dict]:
        """Find times of peak GPU usage."""
        peaks = []
        metrics = ['utilization', 'temperature', 'memory_used', 'power_draw']
        
        for metric in metrics:
            if metric in df.columns:
                peak_idx = df[metric].idxmax()
                peaks.append({
                    'metric': metric,
                    'value': df.loc[peak_idx, metric],
                    'timestamp': df.loc[peak_idx, 'timestamp']
                })
        
        return peaks

    def _calculate_utilization_distribution(self, df: pd.DataFrame) -> Dict:
        """Calculate distribution of GPU utilization."""
        if 'utilization' not in df.columns:
            return {}

        bins = [0, 20, 40, 60, 80, 100]
        labels = ['0-20%', '21-40%', '41-60%', '61-80%', '81-100%']
        df['util_bin'] = pd.cut(df['utilization'], bins=bins, labels=labels)
        
        distribution = df['util_bin'].value_counts().to_dict()
        return {str(k): v for k, v in distribution.items()}

    def _detect_metric_anomalies(self, df: pd.DataFrame, 
                               metric: str) -> List[Dict]:
        """Detect anomalies in a specific metric."""
        if metric not in df.columns:
            return []

        mean = df[metric].mean()
        std = df[metric].std()
        threshold = self.anomaly_threshold * std
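        # Simple z-score test: a sample counts as anomalous when it deviates
        # from the metric's mean by more than anomaly_threshold standard deviations.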
        
        anomalies = []
        anomaly_points = df[abs(df[metric] - mean) > threshold]
        
        for idx, row in anomaly_points.iterrows():
            anomalies.append({
                'metric': metric,
                'value': row[metric],
                'timestamp': row['timestamp'],
                'deviation': abs(row[metric] - mean) / std
            })
        
        return anomalies

    def _calculate_trend(self, df: pd.DataFrame, metric: str) -> Dict:
        """Calculate trend for a specific metric."""
        if metric not in df.columns:
            return {}

        # Calculate linear regression
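        # (x is the sample index, so the slope is in metric units per sample;
        # samples are assumed to be evenly spaced in time.)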
        x = np.arange(len(df))
        y = df[metric].values
        slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
        
        return {
            'slope': slope,
            'r_squared': r_value**2,
            'p_value': p_value,
            'trend_direction': 'increasing' if slope > 0 else 'decreasing' if slope < 0 else 'stable',
            'significance': p_value < 0.05
        }

    def _calculate_power_efficiency(self, df: pd.DataFrame) -> Dict:
        """Calculate power efficiency metrics."""
        efficiency = (df['utilization'] / df['power_draw']).mean()
        return {
            'avg_efficiency': efficiency,
            'peak_efficiency': (df['utilization'] / df['power_draw']).max()
        }

    def _calculate_memory_efficiency(self, df: pd.DataFrame) -> Dict:
        """Calculate memory efficiency metrics."""
        memory_util = (df['memory_used'] / df['memory_total']).mean() * 100
        return {
            'avg_memory_utilization': memory_util,
            'peak_memory_utilization': (df['memory_used'] / df['memory_total']).max() * 100
        }

# Create singleton instance
analytics_service = AnalyticsService()

backend/src/service/app.py (10.8 KiB)

import sys
from pathlib import Path
backend_dir = str(Path(__file__).resolve().parent.parent.parent)
if backend_dir not in sys.path:
    sys.path.insert(0, backend_dir)

from fastapi import FastAPI, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
from fastapi.openapi.docs import get_swagger_ui_html, get_redoc_html
from fastapi.responses import JSONResponse
import re
from collections import deque
from datetime import datetime, timedelta
from typing import Optional, List, Dict
import logging

# Import our components
from src.database.client import db
from src.models.gpu_metrics import GpuMetricsRecord, GpuBurnMetrics, NvidiaInfo, GpuMetrics
from src.service.alerts import alert_system
from src.service.system_health import SystemHealthCheck

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Initialize system health checker for nvidia-smi operations
system_health = SystemHealthCheck()

app = FastAPI(
    title="GPU Sentinel Pro API",
    description="""
    Enterprise-grade NVIDIA GPU monitoring API with real-time analytics and alerts.
    
    ## Features
    * Real-time GPU metrics monitoring
    * Historical data analysis
    * Alert system with configurable thresholds
    * System health diagnostics
    
    ## Authentication
    All endpoints are currently open. For enterprise deployments, configure authentication 
    as needed.
    
    ## Rate Limiting
    Default rate limit: 100 requests per minute per IP
    """,
    version="1.0.0",
    docs_url=None,  # Disable default docs to use custom endpoint
    redoc_url=None  # Disable default redoc to use custom endpoint
)
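
# NOTE: wildcard origins combined with allow_credentials=True is deliberately
# permissive for local development; restrict allow_origins for production.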

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# In-memory state
temperature_history = {}
peak_temperatures = {}
logging_enabled = True

# Custom documentation endpoints
@app.get("/docs", include_in_schema=False)
async def custom_swagger_ui_html():
    return get_swagger_ui_html(
        openapi_url="/openapi.json",
        title="GPU Sentinel Pro - API Documentation",
        swagger_js_url="https://cdn.jsdelivr.net/npm/swagger-ui-dist@5/swagger-ui-bundle.js",
        swagger_css_url="https://cdn.jsdelivr.net/npm/swagger-ui-dist@5/swagger-ui.css",
    )

@app.get("/redoc", include_in_schema=False)
async def redoc_html():
    return get_redoc_html(
        openapi_url="/openapi.json",
        title="GPU Sentinel Pro - API Documentation",
        redoc_js_url="https://cdn.jsdelivr.net/npm/redoc@next/bundles/redoc.standalone.js",
    )

def get_nvidia_info() -> NvidiaInfo:
    try:
        result = system_health._run_nvidia_command([system_health.nvidia_smi_path])
        cuda_version = "Unknown"
        driver_version = "Unknown"
        
        if result.stdout:
            cuda_match = re.search(r'CUDA Version: ([\d\.]+)', result.stdout)
            if cuda_match:
                cuda_version = cuda_match.group(1)
            
            driver_match = re.search(r'Driver Version: ([\d\.]+)', result.stdout)
            if driver_match:
                driver_version = driver_match.group(1)

        return NvidiaInfo(
            driver_version=driver_version,
            cuda_version=cuda_version
        )
    except Exception as e:
        logger.error(f"Error getting NVIDIA info: {str(e)}")
        return NvidiaInfo(
            driver_version="Unknown",
            cuda_version="Unknown"
        )

def get_gpu_metrics() -> GpuMetricsRecord:
    try:
        nvidia_info = get_nvidia_info()
        
        gpu_info = system_health._run_nvidia_command([
            system_health.nvidia_smi_path,
            "--query-gpu=index,name,fan.speed,power.draw,memory.total,memory.used,utilization.gpu,temperature.gpu,compute_mode,power.limit",
            "--format=csv,noheader,nounits"
        ])

        gpus = []
        current_time = datetime.now().timestamp()
        
        if gpu_info.stdout.strip():
            for line in gpu_info.stdout.strip().split('\n'):
                values = [v.strip() for v in line.split(',')]
                if len(values) >= 10:
                    gpu_index = int(values[0])
                    temperature = float(values[7])
                    
                    if gpu_index not in temperature_history:
                        temperature_history[gpu_index] = deque(maxlen=40)
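                        # maxlen=40 keeps roughly 10 s of history per GPU at the 250 ms base polling interval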
                    temperature_history[gpu_index].append((current_time, temperature))
                    
                    if gpu_index not in peak_temperatures or temperature > peak_temperatures[gpu_index]:
                        peak_temperatures[gpu_index] = temperature

                    gpu = GpuMetrics(
                        index=gpu_index,
                        name=values[1],
                        fan_speed=int(float(values[2])),
                        power_draw=float(values[3]),
                        power_limit=int(float(values[9])),
                        memory_total=int(float(values[4])),
                        memory_used=int(float(values[5])),
                        gpu_utilization=int(float(values[6])),
                        temperature=int(temperature),
                        peak_temperature=int(peak_temperatures[gpu_index]),
                        temp_change_rate=0,
                        compute_mode=values[8]
                    )
                    gpus.append(gpu)

        metrics = GpuMetricsRecord(
            nvidia_info=nvidia_info,
            gpus=gpus,
            processes=[],
            gpu_burn_metrics=GpuBurnMetrics(
                running=False,
                duration=0,
                errors=0
            ),
            success=True,
            timestamp=datetime.utcnow().isoformat()
        )

        # Check for alerts
        alert_system.check_metrics(metrics)

        # Store in database only if logging is enabled
        if logging_enabled:
            try:
                db.insert_gpu_metrics(metrics)
                logger.info("Metrics stored in database")
            except Exception as e:
                logger.error(f"Failed to store metrics: {e}")
        else:
            logger.debug("Metrics logging is disabled, skipping database insert")

        return metrics
    except Exception as e:
        logger.error(f"Error getting GPU metrics: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/api/gpu-stats", 
    response_model=List[GpuMetrics],
    tags=["Metrics"],
    summary="Get current GPU statistics",
    description="Returns real-time metrics for all available NVIDIA GPUs including temperature, utilization, memory usage, and power consumption."
)
async def get_gpu_stats():
    """Service information and status"""
    try:
        metrics = get_gpu_metrics()
        return metrics.gpus
    except Exception as e:
        logger.error(f"Error getting GPU stats: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/api/gpu-stats/history",
    response_model=List[GpuMetricsRecord],
    tags=["Metrics"],
    summary="Get historical GPU metrics",
    description="""
    Retrieve historical GPU metrics within a specified time range.
    
    - Use ISO format for dates (e.g., 2025-02-08T20:00:00Z)
    - Default lookback period is 24 hours
    - Maximum lookback period is 168 hours (1 week)
    """
)
async def get_gpu_history(
    start_time: Optional[str] = Query(
        None,
        description="Start time in ISO format (default: 24 hours ago)"
    ),
    end_time: Optional[str] = Query(
        None,
        description="End time in ISO format (default: current time)"
    ),
    hours: Optional[int] = Query(
        24,
        description="Number of hours to look back (used if start_time not provided)",
        ge=1,
        le=168  # 1 week max
    )
):
    """Get historical GPU metrics"""
    try:
        # If no start_time provided, use hours parameter
        if not start_time:
            start_time = (datetime.utcnow() - timedelta(hours=hours)).isoformat()
        
        # If no end_time provided, use current time
        if not end_time:
            end_time = datetime.utcnow().isoformat()

        # Validate and parse timestamps
        try:
            datetime.fromisoformat(start_time.replace('Z', '+00:00'))
            datetime.fromisoformat(end_time.replace('Z', '+00:00'))
        except ValueError:
            raise HTTPException(
                status_code=400,
                detail="Invalid timestamp format. Use ISO format (e.g., 2024-01-01T00:00:00Z)"
            )

        return db.get_metrics_in_timerange(start_time, end_time)
    except HTTPException:
        # Let deliberate 4xx responses (e.g., bad timestamps) pass through
        raise
    except Exception as e:
        logger.error(f"Error getting historical data: {e}")
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/api/alerts",
    response_model=List[Dict],
    tags=["Alerts"],
    summary="Get recent alerts",
    description="Retrieve alerts generated within the specified time period. Includes temperature spikes, resource constraints, and system health issues."
)
async def get_alerts(
    hours: int = Query(
        24,
        description="Number of hours to look back",
        ge=1,
        le=168
    )
):
    """Get recent alerts"""
    return alert_system.get_recent_alerts(hours)

@app.post("/api/logging/toggle",
    response_model=Dict[str, bool],
    tags=["System"],
    summary="Toggle metrics logging",
    description="Enable or disable metrics logging to the database. Useful for maintenance or debugging."
)
async def toggle_logging():
    """Toggle metrics logging"""
    global logging_enabled
    logging_enabled = not logging_enabled
    logger.info(f"Metrics logging {'enabled' if logging_enabled else 'disabled'}")
    return {"logging_enabled": logging_enabled}

@app.get("/api/logging/status",
    response_model=Dict[str, bool],
    tags=["System"],
    summary="Get logging status",
    description="Check if metrics logging is currently enabled or disabled."
)
async def get_logging_status():
    """Get logging status"""
    return {"logging_enabled": logging_enabled}

@app.get("/")
async def root():
    """Service information and status"""
    return {
        "name": "GPU Metrics Service",
        "version": "1.0.0",
        "status": "running",
        "endpoints": {
            "GET /api/gpu-stats": "Current GPU metrics",
            "GET /api/gpu-stats/history": "Historical GPU metrics (optional: start_time, end_time, hours=24)",
            "GET /api/alerts": "Recent alerts",
            "GET /api/logging/status": "Get current logging status",
            "POST /api/logging/toggle": "Toggle metrics logging"
        }
    }

if __name__ == "__main__":
    import uvicorn
    logger.info("Starting GPU Metrics Service")
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=5183,
        reload=True
    )

backend/src/service/config.py (868 B)

import yaml
import os
from pathlib import Path

class Config:
    _instance = None
    
    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(Config, cls).__new__(cls)
            cls._instance._load_config()
        return cls._instance
    
    def _load_config(self):
        config_path = Path(__file__).parent / 'config.yaml'
        with open(config_path, 'r') as f:
            self._config = yaml.safe_load(f)
            
    def get(self, *keys):
        """Get a config value using dot notation, e.g., config.get('polling', 'base_interval')"""
        value = self._config
        for key in keys:
            value = value[key]
        return value
    
    def reload(self):
        """Reload configuration file"""
        self._load_config()
        return self._config

# Create singleton instance
config = Config()

backend/src/service/config.yaml (655 B)

# Alert thresholds matching frontend
alerts:
  temperature:
    critical: 80
    warning: 70
    caution: 60
    good: 50
    # below 50 is ideal

  gpu_utilization:
    critical: 90
    warning: 75
    caution: 50
    good: 25
    # below 25 is ideal

  fan_speed:
    critical: 80
    warning: 65
    caution: 50
    good: 35
    # below 35 is ideal

  memory_usage:
    critical: 90
    warning: 75
    caution: 50
    good: 25
    # below 25 is ideal

# Polling intervals
polling:
  base_interval: 0.25  # 250ms
  max_interval: 10.0   # 10 seconds

# Data retention
retention:
  days_to_keep: 30
  cleanup_on_startup: true
  cleanup_on_shutdown: true

backend/src/service/gpu_service.log (238 B)

Traceback (most recent call last):
  File "/home/explora/dev/mvllc/git/gpu-sentinel-pro/backend/src/service/app.py", line 7, in <module>
    from fastapi import FastAPI, HTTPException, Query
ModuleNotFoundError: No module named 'fastapi'

backend/src/service/logging_manager.py (6.7 KiB)

import logging
import json
from datetime import datetime
from typing import Dict
import psycopg2
from psycopg2.extras import Json
from contextlib import contextmanager
import yaml

logger = logging.getLogger(__name__)

class LoggingManager:
    def __init__(self, config_path: str = "config.yaml"):
        self.config_path = config_path
        self.config = self._load_config()
        self.is_logging_enabled = True
        self._setup_db_connection()

    def _load_config(self) -> Dict:
        """Load configuration from YAML file."""
        try:
            with open(self.config_path, 'r') as f:
                return yaml.safe_load(f)
        except Exception as e:
            logger.error(f"Error loading config: {e}")
            return {}

    def _setup_db_connection(self):
        """Setup database connection parameters."""
        db_config = self.config.get('database', {})
        self.db_params = {
            'dbname': db_config.get('name', 'gpu_sentinel'),
            'user': db_config.get('user', 'postgres'),
            'password': db_config.get('password', ''),
            'host': db_config.get('host', 'localhost'),
            'port': db_config.get('port', 5432)
        }

    @contextmanager
    def get_db_connection(self):
        """Context manager for database connections."""
        conn = None
        try:
            conn = psycopg2.connect(**self.db_params)
            yield conn
        except Exception as e:
            logger.error(f"Database connection error: {e}")
            raise
        finally:
            if conn:
                conn.close()

    def toggle_logging(self, enabled: bool) -> bool:
        """Enable or disable logging."""
        self.is_logging_enabled = enabled
        return self.is_logging_enabled

    def log_gpu_metrics(self, metrics: Dict) -> bool:
        """Log GPU metrics to database if logging is enabled."""
        if not self.is_logging_enabled:
            return False

        try:
            with self.get_db_connection() as conn:
                with conn.cursor() as cur:
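                    # Json() from psycopg2.extras adapts the Python lists/dicts
                    # into JSONB for the gpus and processes columns.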
                    cur.execute("""
                        INSERT INTO gpu_metrics (
                            timestamp, duration, errors, running,
                            cuda_version, driver_version, gpus, processes, success
                        ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
                    """, (
                        datetime.now(),
                        metrics.get('duration', 0),
                        metrics.get('errors', 0),
                        metrics.get('running', False),
                        metrics.get('cuda_version', ''),
                        metrics.get('driver_version', ''),
                        Json(metrics.get('gpus', [])),
                        Json(metrics.get('processes', [])),
                        metrics.get('success', False)
                    ))
                conn.commit()
            return True
        except Exception as e:
            logger.error(f"Error logging GPU metrics: {e}")
            return False

    def get_retention_policy(self) -> Dict[str, int]:
        """Get current data retention policy."""
        return {
            'metrics_retention_days': self.config.get('retention', {}).get('metrics_days', 30),
            'alerts_retention_days': self.config.get('retention', {}).get('alerts_days', 90)
        }

    def update_retention_policy(self, metrics_days: int, alerts_days: int) -> bool:
        """Update data retention policy."""
        try:
            self.config['retention'] = {
                'metrics_days': metrics_days,
                'alerts_days': alerts_days
            }
            with open(self.config_path, 'w') as f:
                yaml.dump(self.config, f)
            return True
        except Exception as e:
            logger.error(f"Error updating retention policy: {e}")
            return False

    def cleanup_old_data(self) -> Dict[str, int]:
        """Clean up data based on retention policy."""
        retention = self.get_retention_policy()
        deleted_counts = {'metrics': 0, 'alerts': 0}

        try:
            with self.get_db_connection() as conn:
                with conn.cursor() as cur:
                    # Clean up old metrics; RETURNING cannot carry an aggregate,
                    # so use cur.rowcount for the number of deleted rows
                    cur.execute("""
                        DELETE FROM gpu_metrics
                        WHERE timestamp < NOW() - INTERVAL '%s days'
                    """, (retention['metrics_retention_days'],))
                    deleted_counts['metrics'] = cur.rowcount

                    # Clean up old alerts
                    cur.execute("""
                        DELETE FROM alert_history
                        WHERE created_at < NOW() - INTERVAL '%s days'
                    """, (retention['alerts_retention_days'],))
                    deleted_counts['alerts'] = cur.rowcount

                conn.commit()
            return deleted_counts
        except Exception as e:
            logger.error(f"Error cleaning up old data: {e}")
            return deleted_counts

    def export_data(self, start_date: datetime, end_date: datetime, 
                   export_path: str) -> bool:
        """Export data within date range to JSON file."""
        try:
            with self.get_db_connection() as conn:
                with conn.cursor() as cur:
                    cur.execute("""
                        SELECT * FROM gpu_metrics
                        WHERE timestamp BETWEEN %s AND %s
                        ORDER BY timestamp
                    """, (start_date, end_date))
                    
                    columns = [desc[0] for desc in cur.description]
                    data = []
                    
                    for row in cur:
                        data.append(dict(zip(columns, row)))

            # Convert datetime objects to ISO format
            for record in data:
                record['timestamp'] = record['timestamp'].isoformat()
                record['created_at'] = record['created_at'].isoformat()

            with open(export_path, 'w') as f:
                json.dump(data, f, indent=2)

            return True
        except Exception as e:
            logger.error(f"Error exporting data: {e}")
            return False

# Example usage
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    manager = LoggingManager()
    
    # Example: Toggle logging
    manager.toggle_logging(False)
    
    # Example: Update retention policy
    manager.update_retention_policy(metrics_days=60, alerts_days=120)
    
    # Example: Clean up old data
    deleted = manager.cleanup_old_data()
    print(f"Cleaned up {deleted['metrics']} metrics and {deleted['alerts']} alerts")

backend/src/service/run_service.sh (433 B)

#!/bin/bash

# Directory of this script
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

# Activate virtual environment
source "$DIR/../../venv/bin/activate"

# Start the service in background with nohup
nohup python "$DIR/app.py" > "$DIR/gpu_service.log" 2>&1 &

# Save the PID to a file
echo $! > "$DIR/service.pid"

echo "GPU Metrics Service started with PID $(cat "$DIR/service.pid")"
echo "Logs available at $DIR/gpu_service.log"

backend/src/service/service.pid (7 B)

358598

backend/src/service/settings.py (2.0 KiB)

import yaml
from pathlib import Path
import logging

logger = logging.getLogger(__name__)

class Settings:
    def __init__(self):
        self.config_path = Path(__file__).parent / 'config.yaml'
        self.load_config()

    def load_config(self):
        """Load configuration from yaml file"""
        try:
            with open(self.config_path, 'r') as f:
                self._config = yaml.safe_load(f)
                logger.info("Configuration loaded successfully")
        except Exception as e:
            logger.error(f"Error loading configuration: {e}")
            # Provide sensible defaults
            self._config = {
                'polling': {
                    'base_interval': 0.25,
                    'max_interval': 3600,
                    'activity_thresholds': {
                        'low': {'idle_time': 300, 'interval': 60},
                        'medium': {'idle_time': 1800, 'interval': 300},
                        'high': {'idle_time': 7200, 'interval': 3600}
                    }
                },
                'retention': {
                    'days_to_keep': 30,
                    'cleanup_on_startup': True,
                    'cleanup_on_shutdown': True
                },
                'alerts': {
                    'temperature': {'warning': 80, 'critical': 90},
                    'gpu_utilization': {'warning': 90, 'critical': 95},
                    'memory_usage': {'warning': 90, 'critical': 95}
                }
            }
            logger.info("Using default configuration")

    def get(self, *keys, default=None):
        """Get configuration value using dot notation"""
        try:
            value = self._config
            for key in keys:
                value = value[key]
            return value
        except (KeyError, TypeError):
            return default

    def reload(self):
        """Reload configuration file"""
        self.load_config()
        return self._config

# Create singleton instance
settings = Settings()

backend/src/service/stop_service.sh (414 B)

#!/bin/bash

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
PID_FILE="$DIR/service.pid"

if [ -f "$PID_FILE" ]; then
    PID=$(cat "$PID_FILE")
    if ps -p "$PID" > /dev/null; then
        echo "Stopping GPU Metrics Service (PID: $PID)"
        kill "$PID"
        rm "$PID_FILE"
    else
        echo "Service not running (stale PID file)"
        rm "$PID_FILE"
    fi
else
    echo "No PID file found"
fi

backend/src/service/system_health.py (10.2 KiB)

import subprocess
import shutil
import logging
from typing import Any, Dict, List
import re
import os

logger = logging.getLogger(__name__)

class SystemHealthCheck:
    def __init__(self):
        # Find nvidia-smi with full path and validate it
        self.nvidia_smi_path = shutil.which('nvidia-smi')
        if self.nvidia_smi_path:
            self.nvidia_smi_path = os.path.realpath(self.nvidia_smi_path)
            if not os.path.exists(self.nvidia_smi_path):
                self.nvidia_smi_path = None
            elif not os.access(self.nvidia_smi_path, os.X_OK):
                self.nvidia_smi_path = None
        self._driver_version = None
        self._cuda_version = None

    def _validate_nvidia_command(self, args: List[str]) -> bool:
        """Validate nvidia-smi command arguments"""
        valid_args = [
            "--query-gpu=gpu_name",
            "--query-gpu=gpu_name,gpu_bus_id,memory.total,compute_mode",
            "--query-gpu=driver_version",
            "--query-gpu=index,name,fan.speed,power.draw,memory.total,memory.used,utilization.gpu,temperature.gpu,compute_mode,power.limit",
            "--format=csv,noheader",
            "--format=csv,noheader,nounits"
        ]
        return all(arg in valid_args or arg == self.nvidia_smi_path for arg in args)

    def _run_nvidia_command(self, args: List[str]) -> subprocess.CompletedProcess:
        """Run nvidia-smi command with validation"""
        if not self.nvidia_smi_path:
            raise RuntimeError("nvidia-smi not found or not executable")
        
        if not self._validate_nvidia_command(args):
            raise ValueError("Invalid nvidia-smi arguments")

        return subprocess.run(
            args,
            capture_output=True,
            text=True,
            timeout=5,
            check=False  # We handle return code manually
        )

    def check_nvidia_smi(self) -> Dict[str, bool | str]:
        """Check if nvidia-smi is available and accessible."""
        if not self.nvidia_smi_path:
            logger.error("nvidia-smi not found in system PATH")
            return {
                "available": False,
                "error": "NVIDIA System Management Interface (nvidia-smi) not found. Please install NVIDIA drivers."
            }
        
        try:
            result = self._run_nvidia_command([
                self.nvidia_smi_path,
                "--query-gpu=gpu_name",
                "--format=csv,noheader"
            ])
            if result.returncode != 0:
                logger.error(f"nvidia-smi command failed: {result.stderr}")
                return {
                    "available": False,
                    "error": f"nvidia-smi command failed: {result.stderr}"
                }
            return {"available": True, "path": self.nvidia_smi_path}
        except subprocess.TimeoutExpired:
            logger.error("nvidia-smi command timed out")
            return {
                "available": False,
                "error": "nvidia-smi command timed out. System might be overloaded."
            }
        except Exception as e:
            logger.error(f"Error running nvidia-smi: {str(e)}")
            return {
                "available": False,
                "error": f"Error running nvidia-smi: {str(e)}"
            }

    def check_gpus(self) -> Dict[str, bool | List[str] | str]:
        """Check for available GPUs and their status."""
        if not self.nvidia_smi_path:
            return {
                "available": False,
                "error": "nvidia-smi not available",
                "gpus": []
            }

        try:
            result = self._run_nvidia_command([
                self.nvidia_smi_path,
                "--query-gpu=gpu_name,gpu_bus_id,memory.total,compute_mode",
                "--format=csv,noheader"
            ])
            
            if result.returncode != 0:
                return {
                    "available": False,
                    "error": f"GPU query failed: {result.stderr}",
                    "gpus": []
                }

            gpus = [gpu.strip() for gpu in result.stdout.split('\n') if gpu.strip()]
            
            if not gpus:
                return {
                    "available": False,
                    "error": "No GPUs detected",
                    "gpus": []
                }

            return {
                "available": True,
                "count": len(gpus),
                "gpus": gpus
            }

        except Exception as e:
            logger.error(f"Error checking GPUs: {str(e)}")
            return {
                "available": False,
                "error": f"Error checking GPUs: {str(e)}",
                "gpus": []
            }

    def check_driver_version(self) -> Dict[str, bool | str]:
        """Check NVIDIA driver version."""
        if not self.nvidia_smi_path:
            return {
                "available": False,
                "error": "nvidia-smi not available"
            }

        try:
            result = self._run_nvidia_command([
                self.nvidia_smi_path,
                "--query-gpu=driver_version",
                "--format=csv,noheader"
            ])
            
            if result.returncode != 0:
                return {
                    "available": False,
                    "error": f"Driver version query failed: {result.stderr}"
                }

            version = result.stdout.strip()
            if not version:
                return {
                    "available": False,
                    "error": "Could not determine driver version"
                }

            self._driver_version = version
            return {
                "available": True,
                "version": version
            }

        except Exception as e:
            logger.error(f"Error checking driver version: {str(e)}")
            return {
                "available": False,
                "error": f"Error checking driver version: {str(e)}"
            }

    def check_cuda_version(self) -> Dict[str, bool | str]:
        """Check CUDA version."""
        if not self.nvidia_smi_path:
            return {
                "available": False,
                "error": "nvidia-smi not available"
            }

        try:
            result = self._run_nvidia_command([self.nvidia_smi_path])
            
            if result.returncode != 0:
                return {
                    "available": False,
                    "error": f"CUDA version query failed: {result.stderr}"
                }

            # Look for CUDA Version in output
            cuda_match = re.search(r'CUDA Version:\s+(\d+\.\d+)', result.stdout)
            if not cuda_match:
                return {
                    "available": False,
                    "error": "Could not determine CUDA version"
                }

            self._cuda_version = cuda_match.group(1)
            return {
                "available": True,
                "version": self._cuda_version
            }

        except Exception as e:
            logger.error(f"Error checking CUDA version: {str(e)}")
            return {
                "available": False,
                "error": f"Error checking CUDA version: {str(e)}"
            }

    def check_memory_requirements(self) -> Dict[str, bool | str]:
        """Check if system meets memory requirements."""
        try:
            import psutil
            memory = psutil.virtual_memory()
            
            # Require at least 4GB of total RAM
            min_memory = 4 * 1024 * 1024 * 1024  # 4GB in bytes
            
            if memory.total < min_memory:
                return {
                    "meets_requirements": False,
                    "error": f"Insufficient system memory. Required: 4GB, Available: {memory.total / (1024**3):.1f}GB"
                }
                
            return {
                "meets_requirements": True,
                "total_memory": f"{memory.total / (1024**3):.1f}GB",
                "available_memory": f"{memory.available / (1024**3):.1f}GB"
            }
            
        except Exception as e:
            logger.error(f"Error checking system memory: {str(e)}")
            return {
                "meets_requirements": False,
                "error": f"Error checking system memory: {str(e)}"
            }

    def run_full_check(self) -> Dict[str, Any]:
        """Run all system health checks, executing each check exactly once."""
        results = {
            "nvidia_smi": self.check_nvidia_smi(),
            "gpus": self.check_gpus(),
            "driver": self.check_driver_version(),
            "cuda": self.check_cuda_version(),
            "memory": self.check_memory_requirements()
        }
        # Derive overall readiness from the results already gathered
        results["system_ready"] = all([
            results["nvidia_smi"].get("available", False),
            results["gpus"].get("available", False),
            results["driver"].get("available", False),
            results["cuda"].get("available", False),
            results["memory"].get("meets_requirements", False)
        ])
        return results

    def get_user_friendly_message(self, check_results: Dict[str, Any]) -> str:
        """Generate a user-friendly message from check results."""
        if check_results["system_ready"]:
            return "System is ready for GPU monitoring."

        messages = []
        
        if not check_results["nvidia_smi"]["available"]:
            messages.append(f"NVIDIA SMI Issue: {check_results['nvidia_smi']['error']}")
        
        if not check_results["gpus"]["available"]:
            messages.append(f"GPU Issue: {check_results['gpus']['error']}")
        
        if not check_results["driver"]["available"]:
            messages.append(f"Driver Issue: {check_results['driver']['error']}")
        
        if not check_results["cuda"]["available"]:
            messages.append(f"CUDA Issue: {check_results['cuda']['error']}")
        
        if not check_results["memory"]["meets_requirements"]:
            messages.append(f"Memory Issue: {check_results['memory']['error']}")

        return "\n".join(messages)

# Example usage:
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    health_check = SystemHealthCheck()
    results = health_check.run_full_check()
    print(health_check.get_user_friendly_message(results))

backend/src/service/test_alerts.py (1.8 KiB)

import sys
from pathlib import Path
sys.path.append(str(Path(__file__).parent.parent.parent))

from src.service.alerts import alert_system
from src.models.gpu_metrics import GpuMetricsRecord, GpuMetrics, NvidiaInfo, GpuBurnMetrics
from datetime import datetime

def test_alerts():
    print("Testing alert system...")
    
    # Create test metrics with high temperature
    metrics = GpuMetricsRecord(
        gpu_burn_metrics=GpuBurnMetrics(
            duration=0,
            errors=0,
            running=False
        ),
        gpus=[
            GpuMetrics(
                compute_mode="Default",
                fan_speed=100,  # High fan speed
                gpu_utilization=95,  # High utilization
                index=0,
                memory_total=12288,
                memory_used=11674,  # High memory usage
                name="NVIDIA TITAN Xp",
                peak_temperature=85,
                power_draw=240,
                power_limit=250,
                temp_change_rate=0,
                temperature=85  # High temperature
            )
        ],
        nvidia_info=NvidiaInfo(
            cuda_version="12.2",
            driver_version="535.183.01"
        ),
        processes=[],
        success=True,
        timestamp=datetime.utcnow().isoformat()
    )

    # Check for alerts
    alerts = alert_system.check_metrics(metrics)
    print(f"\nGenerated {len(alerts)} alerts:")
    for alert in alerts:
        print(f"Alert: {alert['metric']} on GPU {alert['gpu_index']}")
        print(f"Value: {alert['value']:.1f}, Threshold: {alert['threshold']}")
        print(f"Severity: {alert['severity']}\n")

    # Get recent alerts
    recent = alert_system.get_recent_alerts(hours=1)
    print(f"Recent alerts in database: {len(recent)}")

if __name__ == "__main__":
    test_alerts()

backend/src/service/test_db.py (2.0 KiB)

import sys
from pathlib import Path
sys.path.append(str(Path(__file__).parent.parent.parent))

from src.database.client import db
from src.models.gpu_metrics import GpuMetricsRecord, GpuBurnMetrics, NvidiaInfo, GpuMetrics
from datetime import datetime

def test_db_connection():
    try:
        # Create a test record
        metrics = GpuMetricsRecord(
            gpu_burn_metrics=GpuBurnMetrics(
                duration=0,
                errors=0,
                running=False
            ),
            gpus=[
                GpuMetrics(
                    compute_mode="Default",
                    fan_speed=30,
                    gpu_utilization=10,
                    index=0,
                    memory_total=12288,
                    memory_used=135,
                    name="NVIDIA TITAN Xp",
                    peak_temperature=48,
                    power_draw=67.17,
                    power_limit=250,
                    temp_change_rate=0,
                    temperature=48
                )
            ],
            nvidia_info=NvidiaInfo(
                cuda_version="12.2",
                driver_version="535.183.01"
            ),
            processes=[],
            success=True,
            timestamp=datetime.utcnow().isoformat()
        )

        print("Inserting test record...")
        result = db.insert_gpu_metrics(metrics)
        print(f"Insert successful, record ID: {result['id']}")

        print("\nRetrieving recent metrics...")
        recent = db.get_metrics_in_timerange(
            start_time=(datetime.utcnow().replace(hour=0, minute=0, second=0)).isoformat(),
            end_time=datetime.utcnow().isoformat()
        )
        print(f"Found {len(recent)} records today")
        
        return True
    except Exception as e:
        print(f"Error: {str(e)}")
        return False

if __name__ == "__main__":
    print("Testing database connection...")
    success = test_db_connection()
    print(f"\nTest {'successful' if success else 'failed'}")

backend/src/service/test_settings.py (627 B)

from settings import settings

def test_config():
    print("Testing configuration loading...")
    
    # Test basic config access
    base_interval = settings.get('polling', 'base_interval')
    print(f"Base polling interval: {base_interval}")
    
    # Test nested config access
    low_threshold = settings.get('polling', 'activity_thresholds', 'low', 'interval')
    print(f"Low activity polling interval: {low_threshold}")
    
    # Test default values
    unknown = settings.get('unknown', 'key', default='default_value')
    print(f"Unknown key with default: {unknown}")

if __name__ == "__main__":
    test_config()

docker-compose.yml (891 B)

version: '3.8'

services:
  backend:
    build: .
    container_name: gpu-sentinel-backend
    restart: always
    ports:
      - "5183:5183"
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
    volumes:
      - /usr/bin/nvidia-smi:/usr/bin/nvidia-smi:ro
      - /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:ro
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    networks:
      - gpu-net

  frontend:
    build: 
      context: ./frontend
      dockerfile: Dockerfile
    container_name: gpu-sentinel-frontend
    restart: always
    ports:
      - "5173:5173"
    depends_on:
      - backend
    networks:
      - gpu-net

networks:
  gpu-net:
    name: gpu-net
    external: true  # Use existing network where Ollama containers run

docs/API.md (3.6 KiB)

GPU Sentinel Pro API Documentation

API Overview

Base URL: http://localhost:5183

The GPU Sentinel Pro API provides real-time and historical GPU metrics through a RESTful interface.

Endpoints

Service Status

GET /

Returns service information and status.

Response Example:

{
    "name": "GPU Metrics Service",
    "version": "1.0.0",
    "status": "running",
    "endpoints": {
        "GET /api/gpu-stats": "Current GPU metrics",
        "GET /api/gpu-stats/history": "Historical GPU metrics",
        "GET /api/alerts": "Recent alerts"
    }
}

Current GPU Statistics

GET /api/gpu-stats

Returns real-time GPU metrics for all detected NVIDIA GPUs.

Response Example:

[
    {
        "index": 0,
        "name": "NVIDIA GeForce RTX 3080",
        "fan_speed": 45,
        "power_draw": 125.5,
        "power_limit": 250,
        "memory_total": 10240,
        "memory_used": 3584,
        "gpu_utilization": 85,
        "temperature": 72,
        "peak_temperature": 75,
        "temp_change_rate": 0.5,
        "compute_mode": "Default"
    }
]

Historical GPU Metrics

GET /api/gpu-stats/history

Retrieves historical GPU metrics within a specified time range.

Query Parameters:

  • start_time (optional): ISO format timestamp (e.g., "2024-02-20T00:00:00Z")
  • end_time (optional): ISO format timestamp (e.g., "2024-02-20T23:59:59Z")
  • hours (optional): Number of hours to look back (1-168, default: 24)

Response Example:

[
    {
        "nvidia_info": {
            "driver_version": "535.183.01",
            "cuda_version": "12.2"
        },
        "gpus": [
            // Same GPU objects as /api/gpu-stats
        ],
        "processes": [],
        "gpu_burn_metrics": {
            "running": false,
            "duration": 0,
            "errors": 0
        },
        "success": true,
        "timestamp": "2024-02-20T15:00:00Z"
    }
]
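
For example, to request the last 6 hours (a sketch using the Python requests library; adjust the base URL to your deployment):

import requests

resp = requests.get(
    "http://localhost:5183/api/gpu-stats/history",
    params={"hours": 6},  # or pass start_time/end_time in ISO format
    timeout=10,
)
resp.raise_for_status()
records = resp.json()
print(f"Fetched {len(records)} metric records")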

Alert History

GET /api/alerts

Retrieves recent system alerts.

Query Parameters:

  • hours (optional): Number of hours of alert history to retrieve (default: 24)

Response Example:

[
    {
        "id": "alert-123",
        "timestamp": "2024-02-20T15:25:00Z",
        "severity": "warning",
        "message": "GPU temperature exceeded 80ยฐC",
        "gpu_index": 0,
        "metric": "temperature",
        "value": 82,
        "threshold": 80
    }
]

Error Responses

Validation Error

{
    "detail": [
        {
            "loc": ["query", "hours"],
            "msg": "ensure this value is less than or equal to 168",
            "type": "value_error.number.not_le"
        }
    ]
}

Rate Limiting

  • Default: 100 requests per minute per IP
  • Historical data endpoints: 30 requests per minute per IP

Authentication

Currently using direct access. Token-based authentication planned for future releases.

Best Practices

  1. Use appropriate polling intervals (recommended: โ‰ฅ250ms)
  2. Include error handling for all API calls
  3. Implement exponential backoff for retries (see the client sketch after this list)
  4. Cache responses when appropriate
  5. Use historical endpoints for trend analysis
  6. Monitor rate limits in production environments
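
For illustration, a minimal client combining practices 1-3 (a sketch only; it assumes the requests package is installed and that BASE_URL matches your deployment):

import time
import requests

BASE_URL = "http://localhost:5183"

def fetch_gpu_stats(max_retries: int = 5, base_delay: float = 0.25) -> list:
    """Fetch current GPU stats, retrying with exponential backoff."""
    delay = base_delay
    for attempt in range(max_retries):
        try:
            response = requests.get(f"{BASE_URL}/api/gpu-stats", timeout=5)
            response.raise_for_status()
            return response.json()
        except requests.RequestException:
            if attempt == max_retries - 1:
                raise
            time.sleep(delay)
            delay *= 2  # exponential backoff between attempts
    return []

for gpu in fetch_gpu_stats():
    print(f"GPU {gpu['index']}: {gpu['temperature']}C, {gpu['gpu_utilization']}% util")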

Future Endpoints (Planned)

  • POST /api/alerts/config - Configure alert thresholds
  • POST /api/logging/control - Control logging behavior
  • GET /api/metrics/analysis - Get performance analysis
  • POST /api/gpu/tasks - Manage GPU tasks

Support

For API issues or feature requests, please use our GitHub Issues page.

docs/INSTALLATION.md (3.5 KiB)

GPU Sentinel Pro Installation Guide

Prerequisites

System Requirements

  • NVIDIA GPU(s)
  • NVIDIA drivers installed and functional
  • nvidia-smi command available
  • Docker and Docker Compose (for database)
  • Python 3.10 or higher
  • Node.js 18 or higher

NVIDIA Driver Verification

nvidia-smi

Should display your GPU information. If not, install NVIDIA drivers first.

Quick Start

1. Clone the Repository

git clone https://github.com/jackccrawford/gpu-sentinel-pro.git
cd gpu-sentinel-pro

2. Database Setup (Supabase)

cd supabase
docker-compose up -d

Verify Supabase is running:

  • Database (Postgres): localhost:54432
  • API: http://localhost:54321

3. Backend Setup

# Create and activate virtual environment
cd backend
python -m venv venv
source venv/bin/activate  # On Windows: .\venv\Scripts\activate

# Install dependencies
pip install -r requirements.txt

# Start the service
cd src/service
./run_service.sh

Verify backend is running:

  • API: http://localhost:5183
  • Documentation: http://localhost:5183/docs

4. Frontend Setup

cd frontend
npm install
./run_frontend.sh

Access the dashboard at http://localhost:3055

Configuration

Environment Variables

Create .env file in backend directory:

SUPABASE_URL=http://localhost:54321
SUPABASE_KEY=your_supabase_key
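
A quick sanity check that the backend process can see these values (a sketch; it assumes the variables are exported into the shell environment rather than only written to .env):

import os

for var in ("SUPABASE_URL", "SUPABASE_KEY"):
    print(f"{var} = {os.environ.get(var, '<not set>')}")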

Database Migrations

cd backend/migrations
# Run migrations in order
psql -h localhost -p 54432 -U postgres -d postgres -f 001_create_gpu_metrics_table.sql
psql -h localhost -p 54432 -U postgres -d postgres -f 002_create_alerts_table.sql

Alert Configuration

Edit backend/src/service/config.yaml:

alerts:
  temperature:
    critical: 80
    warning: 70
    caution: 60

Service Management

Backend Service

  • Start: backend/src/service/run_service.sh
  • Stop: backend/src/service/stop_service.sh
  • Logs: backend/src/service/gpu_service.log

Frontend Service

  • Start: frontend/run_frontend.sh
  • Stop: frontend/stop_frontend.sh
  • Logs: frontend/frontend.log

Troubleshooting

Common Issues

NVIDIA Driver Not Found

# Check driver installation
nvidia-smi

# If not found, install drivers
sudo ubuntu-drivers autoinstall  # Ubuntu
# or
sudo dnf install nvidia-driver   # Fedora

Database Connection Issues

# Check Supabase containers
docker ps | grep supabase

# Check logs
docker logs supabase-db-1

# Reset database
cd supabase
docker-compose down -v
docker-compose up -d

Service Won't Start

  1. Check logs in respective log files
  2. Verify ports are not in use: netstat -tulpn | grep -E '5183|3055|54321|54432'
  3. Ensure all dependencies are installed
  4. Verify Python/Node.js versions

Security Considerations

Production Deployment

  1. Use proper SSL/TLS certificates
  2. Configure proper authentication
  3. Set up proper firewall rules
  4. Use secure database passwords
  5. Enable rate limiting

Access Control

  • Configure CORS settings in backend/src/service/app.py
  • Set up proper database user permissions
  • Use environment variables for sensitive data

Updating

Backend Updates

git pull
cd backend
source venv/bin/activate
pip install -r requirements.txt
./src/service/run_service.sh

Frontend Updates

git pull
cd frontend
npm install
./run_frontend.sh

Support

docs/README.md (2.3 KiB)

GPU Sentinel Pro Documentation

Welcome to the GPU Sentinel Pro documentation. This directory contains comprehensive documentation for users, developers, and contributors.

For Users

For Developers

Documentation Structure

docs/
โ”œโ”€โ”€ README.md          # This file - Documentation overview
โ”œโ”€โ”€ API.md            # API endpoints and usage
โ”œโ”€โ”€ INSTALLATION.md   # Installation and setup guide
โ””โ”€โ”€ ... (future docs)

project root/
โ”œโ”€โ”€ CHANGELOG.md      # Version history
โ”œโ”€โ”€ CONTRIBUTING.md   # Contribution guidelines
โ”œโ”€โ”€ SECURITY.md      # Security policies
โ””โ”€โ”€ TODO.md          # Development roadmap

Documentation Updates

Our documentation follows these principles:

  • Clear and concise explanations
  • Practical examples and use cases
  • Regular updates with new features
  • Version-specific information when needed

Getting Started

  1. New users should start with INSTALLATION.md
  2. API users should refer to API.md
  3. Contributors should read CONTRIBUTING.md
  4. For planned features, see TODO.md

Documentation TODOs

  • [ ] Add troubleshooting guide
  • [ ] Add API examples collection
  • [ ] Add performance tuning guide
  • [ ] Add deployment best practices
  • [ ] Add architecture overview
  • [ ] Add user interface guide

Contributing to Docs

Documentation improvements are always welcome! Please see our Contributing Guide for details on:

  • Documentation style guide
  • How to submit documentation changes
  • Documentation testing
  • Translation guidelines

Support

If you find any issues in the documentation:

  1. Check existing GitHub issues
  2. Create a new issue if needed
  3. Submit a pull request with fixes

License

This documentation is licensed under the same terms as GPU Sentinel Pro. See LICENSE for details.

docs/architecture/ARCHITECTURE.md (5.9 KiB)

GPU Sentinel Pro - System Architecture

System Overview

graph TB
    subgraph "Frontend Layer"
        R[React Application]
        V[Vite Dev Server]
    end

    subgraph "Backend Layer"
        F[FastAPI Server]
        N[NVIDIA SMI Interface]
        A[Alert Manager]
    end

    subgraph "Data Layer"
        S[(Supabase DB)]
        C[Cache Layer]
    end

    R -->|HTTP/WebSocket| F
    F -->|Query| S
    F -->|Commands| N
    F -->|Triggers| A
    A -->|Store| S
    F -->|Cache| C

Component Architecture

Frontend Components

graph TB
    subgraph "UI Layer"
        D[Dashboard]
        M[Metrics Display]
        A[Alert Panel]
        H[History View]
    end

    subgraph "State Management"
        Q[Query Client]
        S[State Store]
    end

    subgraph "Data Layer"
        AP[API Client]
        WS[WebSocket Client]
    end

    D --> M
    D --> A
    D --> H
    M --> Q
    A --> Q
    H --> Q
    Q --> AP
    Q --> WS
    Q --> S

Backend Services

graph LR
    subgraph "API Layer"
        E[Endpoints]
        M[Middleware]
        A[Auth]
    end

    subgraph "Core Services"
        GM[GPU Monitor]
        AM[Alert Manager]
        HM[History Manager]
    end

    subgraph "Infrastructure"
        DB[Database]
        C[Cache]
        N[NVIDIA SMI]
    end

    E --> M
    M --> A
    M --> GM
    M --> AM
    M --> HM
    GM --> N
    AM --> DB
    HM --> DB
    GM --> C

Data Flow

Real-time Metrics Flow

  1. The backend polls GPU metrics via NVIDIA SMI (250ms intervals; see the sketch after this list)
  2. Backend processes and validates data
  3. WebSocket pushes updates to frontend
  4. React components re-render with new data
  5. Metrics stored in time-series database
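
A schematic of steps 1, 2, and 5 (illustrative only; the collector names here are hypothetical, and the real polling logic lives in backend/src/service/app.py):

import time

POLL_INTERVAL = 0.25  # 250 ms, matching config.yaml's base_interval

def collector_loop(read_metrics, validate, store, push):
    """Poll, validate, fan out, and persist GPU metrics."""
    while True:
        raw = read_metrics()     # 1. query nvidia-smi
        metrics = validate(raw)  # 2. process and validate
        push(metrics)            # 3./4. push updates to clients
        store(metrics)           # 5. persist to the time-series store
        time.sleep(POLL_INTERVAL)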

Alert Flow

  1. Backend evaluates metrics against thresholds
  2. Alert triggered if threshold exceeded (see the sketch after this list)
  3. Alert stored in database
  4. WebSocket pushes alert to frontend
  5. Alert notification displayed
  6. External notifications sent (email/webhook)
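
A condensed sketch of steps 1-3 (illustrative only; the production logic lives in backend/src/service/alerts.py and reads its thresholds from config.yaml):

# Hypothetical, simplified threshold evaluation mirroring the alert flow.
THRESHOLDS = {"temperature": {"warning": 70, "critical": 80}}

def evaluate(metric: str, value: float) -> str | None:
    """Return the most severe breached level, or None."""
    levels = THRESHOLDS.get(metric, {})
    for severity in ("critical", "warning"):  # most severe first
        if severity in levels and value >= levels[severity]:
            return severity
    return None

assert evaluate("temperature", 85) == "critical"
assert evaluate("temperature", 65) is None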

Technical Components

Frontend Stack

  • Framework: React 18+
  • Language: TypeScript 5+
  • Build Tool: Vite
  • State Management: React Query
  • UI Components: Custom components
  • Data Visualization: Custom charts
  • WebSocket Client: Native WebSocket

Backend Stack

  • Framework: FastAPI
  • Language: Python 3.10+
  • ASGI Server: Uvicorn
  • Task Queue: Background tasks
  • Caching: In-memory + Redis
  • Monitoring: Custom metrics

Database Schema

GPU Metrics Table

CREATE TABLE gpu_metrics (
    id BIGSERIAL PRIMARY KEY,
    timestamp TIMESTAMPTZ NOT NULL,
    gpu_id INTEGER NOT NULL,
    temperature FLOAT,
    memory_used BIGINT,
    memory_total BIGINT,
    gpu_utilization INTEGER,
    power_draw FLOAT,
    power_limit FLOAT,
    fan_speed INTEGER,
    metadata JSONB,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX idx_gpu_metrics_timestamp 
    ON gpu_metrics (timestamp DESC);
CREATE INDEX idx_gpu_metrics_gpu_id 
    ON gpu_metrics (gpu_id);
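
For illustration, a row can be written with the supabase-py client; a minimal sketch assuming the SUPABASE_URL/SUPABASE_KEY environment variables used elsewhere in these docs (the sample values are hypothetical):

import os
from datetime import datetime, timezone

from supabase import create_client

client = create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])

client.table("gpu_metrics").insert({
    "timestamp": datetime.now(timezone.utc).isoformat(),
    "gpu_id": 0,
    "temperature": 67.0,
    "memory_used": 4096,      # MB
    "memory_total": 24576,    # MB
    "gpu_utilization": 82,
    "power_draw": 180.5,      # W
    "power_limit": 250.0,     # W
    "fan_speed": 55,          # %
}).execute()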

Alerts Table

CREATE TABLE alerts (
    id BIGSERIAL PRIMARY KEY,
    timestamp TIMESTAMPTZ NOT NULL,
    gpu_id INTEGER NOT NULL,
    alert_type VARCHAR(50) NOT NULL,
    severity VARCHAR(20) NOT NULL,
    message TEXT NOT NULL,
    value FLOAT,
    threshold FLOAT,
    acknowledged BOOLEAN DEFAULT FALSE,
    acknowledged_at TIMESTAMPTZ,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX idx_alerts_timestamp 
    ON alerts (timestamp DESC);
CREATE INDEX idx_alerts_gpu_id 
    ON alerts (gpu_id);

Security Architecture

Authentication Flow

  1. Client requests access
  2. Server validates credentials
  3. JWT token issued
  4. Token included in subsequent requests
  5. Expired tokens renewed via a refresh mechanism (see the sketch after this list)
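
A minimal sketch of steps 3-5 using PyJWT (the secret and claim names are hypothetical; the token creation code in TECHNICAL_SPEC.md follows the same pattern):

from datetime import datetime, timedelta, timezone

import jwt  # PyJWT

SECRET_KEY = "change-me"  # load from configuration in practice

def issue_token(user_id: str) -> str:
    # Step 3: issue a short-lived JWT after credentials are validated.
    payload = {"sub": user_id, "exp": datetime.now(timezone.utc) + timedelta(minutes=60)}
    return jwt.encode(payload, SECRET_KEY, algorithm="HS256")

def verify_token(token: str) -> dict:
    # Step 4: each request presents the token; jwt.decode raises
    # jwt.ExpiredSignatureError when a refresh (step 5) is needed.
    return jwt.decode(token, SECRET_KEY, algorithms=["HS256"])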

Authorization Levels

  • Admin: Full system access
  • User: View and acknowledge alerts
  • Reader: View-only access
  • API: Programmatic access

Data Security

  • Encryption at rest
  • TLS for data in transit
  • Secure WebSocket connections
  • Rate limiting
  • Input validation

Deployment Architecture

Development Environment

graph LR
    D[Developer] --> L[Local Environment]
    L --> T[Tests]
    T --> G[Git]
    G --> A[GitHub Actions]

Production Environment

graph LR
    G[GitHub] --> A[GitHub Actions]
    A --> B[Build]
    B --> T[Test]
    T --> D[Deploy]
    D --> P[Production]

Performance Considerations

Frontend Optimization

  • Component memoization
  • Virtual scrolling for large datasets
  • Efficient re-rendering
  • Asset optimization
  • Code splitting

Backend Optimization

  • Connection pooling
  • Query optimization
  • Caching strategy
  • Async operations
  • Resource limits

Database Optimization

  • Partitioning strategy
  • Index optimization
  • Query performance
  • Data retention
  • Backup strategy

Monitoring and Logging

System Metrics

  • API response times
  • WebSocket performance
  • Database query times
  • Cache hit rates
  • Error rates

Application Logs

  • Request/response logging
  • Error tracking
  • Performance metrics
  • Security events
  • System health

Scalability Considerations

Horizontal Scaling

  • Stateless backend
  • Load balancing
  • Session management
  • Cache distribution
  • Database replication

Vertical Scaling

  • Resource optimization
  • Memory management
  • Connection pooling
  • Query optimization
  • Batch processing

Future Architecture Considerations

Planned Enhancements

  • Kubernetes integration
  • Cloud provider metrics
  • ML-based predictions
  • Advanced analytics
  • Custom dashboards

Technical Debt Management

  • Code quality metrics
  • Performance monitoring
  • Security scanning
  • Dependency updates
  • Documentation updates

Development Workflow

Code Pipeline

graph LR
    F[Feature Branch] --> T[Tests]
    T --> R[Review]
    R --> M[Main Branch]
    M --> D[Deploy]

Quality Assurance

  • Automated testing
  • Code review process
  • Performance testing
  • Security scanning
  • Documentation review

docs/requirements/DEVELOPMENT_GUIDE.md (6.5 KiB)

GPU Sentinel Pro - Development Guide

Development Environment Setup

Prerequisites

  • Python 3.10+
  • Node.js 18+
  • NVIDIA GPU with drivers installed
  • Docker and Docker Compose
  • VS Code (recommended)

Initial Setup

  1. Clone and Configure
# Clone repository
git clone https://github.com/jackccrawford/gpu-sentinel-pro.git
cd gpu-sentinel-pro

# Create Python virtual environment
cd backend
python -m venv venv
source venv/bin/activate  # or .\venv\Scripts\activate on Windows

# Install backend dependencies
pip install -r requirements.txt

# Install frontend dependencies
cd ../frontend
npm install
  2. Environment Configuration
# Backend (.env)
SUPABASE_URL=http://localhost:54321
SUPABASE_KEY=your-key
LOG_LEVEL=debug

# Frontend (.env)
VITE_API_URL=http://localhost:5500
VITE_UPDATE_INTERVAL=250
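
The backend can read these values with plain os.getenv; a minimal sketch (the defaults are hypothetical):

import os

SUPABASE_URL = os.getenv("SUPABASE_URL", "http://localhost:54321")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
LOG_LEVEL = os.getenv("LOG_LEVEL", "debug")

if SUPABASE_KEY is None:
    raise RuntimeError("SUPABASE_KEY must be set before starting the service")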

Code Style Guidelines

Python (Backend)

Style Guide

  • Follow PEP 8
  • Maximum line length: 100 characters
  • Use type hints
  • Use async/await for I/O operations
# Good
from typing import Any, Dict

async def get_gpu_metrics(gpu_id: int) -> Dict[str, Any]:
    """
    Fetch metrics for specific GPU.

    Args:
        gpu_id: ID of the GPU to monitor

    Returns:
        Dict containing GPU metrics
    """
    metrics = await nvidia_smi.get_metrics(gpu_id)
    return process_metrics(metrics)

# Bad
def get_gpu_metrics(id):
    metrics = nvidia_smi.get_metrics(id)
    return process_metrics(metrics)

Error Handling

# Good
try:
    metrics = await nvidia_smi.get_metrics()
except NvidiaSMIError as e:
    logger.error(f"Failed to get GPU metrics: {e}")
    raise GPUError(str(e), "NVIDIA_SMI_ERROR")
except Exception as e:
    logger.error(f"Unexpected error: {e}")
    raise

# Bad
try:
    metrics = nvidia_smi.get_metrics()
except:
    print("Error")

TypeScript (Frontend)

Style Guide

  • Use functional components
  • Use TypeScript types/interfaces
  • Use React Query for data fetching
  • Maximum line length: 100 characters
// Good
interface MetricsDisplayProps {
  gpuId: number;
  refreshInterval: number;
}

const MetricsDisplay: React.FC<MetricsDisplayProps> = ({
  gpuId,
  refreshInterval,
}) => {
  const { data, error } = useQuery(['metrics', gpuId], fetchMetrics);

  if (error) return <ErrorDisplay error={error} />;
  if (!data) return <LoadingSpinner />;

  return <MetricsView data={data} />;
};

// Bad
function MetricsDisplay(props) {
  const [data, setData] = useState();
  useEffect(() => {
    fetch('/api/metrics').then(res => setData(res.data));
  }, []);
  return data ? <div>{data}</div> : null;
}

Testing Standards

Backend Testing

Unit Tests

# test_metrics.py
import pytest
from unittest.mock import Mock, patch

class TestMetricsCollector:
    @pytest.fixture
    def collector(self):
        return MetricsCollector()

    @pytest.mark.asyncio  # requires pytest-asyncio to run async test functions
    @patch('nvidia_smi.get_metrics')
    async def test_collection(self, mock_get_metrics, collector):
        mock_get_metrics.return_value = {'temperature': 75}
        metrics = await collector.collect_metrics()
        assert metrics['temperature'] == 75

Integration Tests

# test_api.py
from fastapi.testclient import TestClient
from .main import app

client = TestClient(app)

def test_metrics_endpoint():
    response = client.get("/api/gpu-stats")
    assert response.status_code == 200
    assert "gpus" in response.json()

Frontend Testing

Component Tests

// MetricsDisplay.test.tsx
import { render, screen } from '@testing-library/react';

describe('MetricsDisplay', () => {
  it('renders temperature correctly', () => {
    render(<MetricsDisplay gpuId={0} refreshInterval={250} />);
    expect(screen.getByText(/Temperature/i)).toBeInTheDocument();
  });
});

Integration Tests

// App.test.tsx
import { renderWithProviders } from '../test-utils';

test('full app rendering', async () => {
  const { container } = renderWithProviders(<App />);
  expect(container).toBeInTheDocument();
});

Git Workflow

Branch Naming

  • Feature: feature/description
  • Bug Fix: fix/description
  • Documentation: docs/description
  • Performance: perf/description

Commit Messages

Follow conventional commits:

# Feature
git commit -m "feat: add temperature trend analysis"

# Bug fix
git commit -m "fix: correct memory usage calculation"

# Documentation
git commit -m "docs: update API documentation"

# Performance
git commit -m "perf: optimize metrics polling"

Pull Request Process

  1. Create feature branch
  2. Implement changes
  3. Add tests
  4. Update documentation
  5. Create PR with description
  6. Address review comments
  7. Merge after approval

Debugging Guide

Backend Debugging

# Enable debug logging
import logging
logging.basicConfig(level=logging.DEBUG)

# Add debug points
logger.debug(f"Metrics collected: {metrics}")

# Use VS Code debugger
# launch.json configuration provided

Frontend Debugging

// Use the React DevTools browser extension to inspect component
// state and props, and the browser Network tab to watch the
// polling requests fire at the configured interval.

// Debug logging
console.debug('Metrics updated:', metrics);

Performance Optimization

Backend Optimization

  • Use connection pooling (see the sketch after this list)
  • Implement caching
  • Optimize database queries
  • Use async I/O
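
One way to get pooling against the Supabase Postgres instance is asyncpg; a sketch under assumed connection details (the DSN is hypothetical):

import asyncpg

async def create_pool() -> asyncpg.Pool:
    # A bounded pool reuses connections instead of opening one per request.
    return await asyncpg.create_pool(
        dsn="postgresql://postgres:postgres@localhost:54322/postgres",
        min_size=2,
        max_size=10,
    )

async def latest_metrics(pool: asyncpg.Pool, gpu_id: int):
    async with pool.acquire() as conn:
        return await conn.fetchrow(
            "SELECT * FROM gpu_metrics WHERE gpu_id = $1 "
            "ORDER BY timestamp DESC LIMIT 1",
            gpu_id,
        )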

Frontend Optimization

  • Implement memoization
  • Use React.memo for components
  • Optimize re-renders
  • Implement virtualization

Security Best Practices

Backend Security

  • Input validation
  • Rate limiting
  • Authentication
  • CORS configuration (see the sketch after this list)
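
For the CORS item, FastAPI ships CORSMiddleware; a minimal sketch (the allowed origin is hypothetical, matching a local dev frontend):

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:5173"],  # hypothetical dev origin
    allow_methods=["GET", "POST"],
    allow_headers=["*"],
)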

Frontend Security

  • XSS prevention
  • CSRF protection
  • Secure storage
  • API error handling

Deployment Process

Development

  1. Run tests
  2. Update documentation
  3. Create PR
  4. Code review
  5. Merge to main

Staging

  1. Deploy to staging
  2. Run integration tests
  3. Performance testing
  4. Security scanning

Production

  1. Create release
  2. Deploy to production
  3. Monitor metrics
  4. Verify functionality

Monitoring and Logging

Logging Standards

# Backend logging
logger.info("API request received", extra={
    "endpoint": "/api/metrics",
    "method": "GET",
    "user_id": user_id
})

# Frontend logging
console.info('Metrics updated', {
    timestamp: new Date(),
    metrics: metricsData
});

Monitoring Metrics

  • Response times
  • Error rates
  • Resource usage
  • User activity

Support and Maintenance

Issue Resolution

  1. Reproduce issue
  2. Identify root cause
  3. Implement fix
  4. Add regression test
  5. Deploy solution

Regular Maintenance

  • Dependency updates
  • Security patches
  • Performance optimization
  • Documentation updates

docs/requirements/REQUIREMENTS.md (6.1 KiB)

GPU Sentinel Pro - User Requirements

Overview

This document outlines the user requirements for GPU Sentinel Pro, organized as Epics and User Stories following agile methodologies.

Epics

1. Real-Time Monitoring (E1)

Enable users to monitor GPU performance metrics in real-time with minimal cognitive load.

User Stories:

  • [E1.S1] As an ML engineer, I want to see real-time GPU utilization so I can monitor my training jobs
  • [E1.S2] As a data scientist, I want color-coded temperature indicators so I can quickly identify issues
  • [E1.S3] As a developer, I want to see memory usage patterns so I can detect memory leaks
  • [E1.S4] As a system admin, I want to monitor multiple GPUs simultaneously so I can manage cluster health
  • [E1.S5] As a user, I want dark/light mode options so I can comfortably monitor in any lighting condition

Acceptance Criteria:

  • Updates at least every 250ms
  • Clear visual indicators for critical metrics
  • Support for multi-GPU systems
  • Responsive design for different screen sizes
  • Configurable refresh rates

2. Alert System (E2)

Provide proactive notifications for critical GPU events and threshold breaches.

User Stories:

  • [E2.S1] As a system admin, I want to set custom alert thresholds so I can prevent hardware damage
  • [E2.S2] As an ML engineer, I want email notifications when training jobs complete or fail
  • [E2.S3] As a team lead, I want alert history so I can track system health patterns
  • [E2.S4] As a developer, I want webhook integration so I can connect alerts to our chat system
  • [E2.S5] As an admin, I want to configure alert severity levels so I can prioritize responses

Acceptance Criteria:

  • Configurable thresholds for all metrics
  • Multiple notification channels
  • Alert history retention
  • Severity level management
  • Alert acknowledgment system

3. Historical Analysis (E3)

Enable data-driven decisions through historical performance analysis.

User Stories:

  • [E3.S1] As an analyst, I want to view historical performance data so I can optimize resource allocation
  • [E3.S2] As an ML engineer, I want to analyze training job patterns so I can improve efficiency
  • [E3.S3] As a manager, I want performance reports so I can plan hardware upgrades
  • [E3.S4] As a developer, I want to export metrics so I can perform custom analysis
  • [E3.S5] As a user, I want to compare performance across time periods so I can identify trends

Acceptance Criteria:

  • Data retention configurable up to 30 days
  • Export functionality in multiple formats
  • Interactive visualization tools
  • Custom date range selection
  • Trend analysis capabilities

4. System Health Management (E4)

Provide comprehensive system health monitoring and management capabilities.

User Stories:

  • [E4.S1] As an admin, I want to pause/resume logging so I can manage database storage
  • [E4.S2] As a user, I want graceful handling of missing drivers so I can troubleshoot setup issues
  • [E4.S3] As a developer, I want API access to health metrics so I can integrate with other tools
  • [E4.S4] As an admin, I want backup/restore capabilities so I can preserve historical data
  • [E4.S5] As a user, I want system requirements verification so I can ensure proper setup

Acceptance Criteria:

  • Data management controls
  • Graceful error handling
  • RESTful API documentation
  • Data integrity protection
  • System diagnostics tools

5. Advanced Features (E5)

Provide enterprise-grade features for power users and organizations.

User Stories:

  • [E5.S1] As a team lead, I want multi-user access control so I can manage team permissions
  • [E5.S2] As a developer, I want custom dashboard layouts so I can focus on relevant metrics
  • [E5.S3] As an admin, I want integration with container orchestration so I can monitor containerized workloads
  • [E5.S4] As an analyst, I want predictive maintenance warnings so I can prevent failures
  • [E5.S5] As a manager, I want cost analysis tools so I can optimize resource spending

Acceptance Criteria:

  • Role-based access control
  • Customizable dashboards
  • Container metrics integration
  • Predictive analytics
  • Cost reporting tools

Priority Matrix

Priority | Epic                      | Rationale
P0       | E1 - Real-Time Monitoring | Core functionality, immediate value
P1       | E4 - System Health        | Essential for reliability
P2       | E2 - Alert System         | Critical for proactive management
P3       | E3 - Historical Analysis  | Important for optimization
P4       | E5 - Advanced Features    | Enhanced value proposition

Technical Requirements

Performance

  • Frontend response time < 100ms
  • Backend processing time < 150ms
  • Support for up to 8 GPUs
  • Minimal resource overhead

Security

  • API authentication
  • Data encryption
  • Secure websocket connections
  • Access control management

Reliability

  • 99.9% uptime target
  • Automatic error recovery
  • Data backup mechanisms
  • Graceful degradation

Scalability

  • Horizontal scaling support
  • Efficient data storage
  • Optimized query performance
  • Resource-aware monitoring

Implementation Phases

Phase 1: Foundation

  • Core monitoring functionality
  • Basic UI implementation
  • Database integration
  • Error handling

Phase 2: Enhancement

  • Alert system
  • Historical data
  • User authentication
  • API documentation

Phase 3: Advanced

  • Advanced analytics
  • Custom dashboards
  • Integration features
  • Predictive capabilities

Success Metrics

User Experience

  • UI response time < 100ms
  • Error rate < 0.1%
  • User satisfaction > 4.5/5

System Performance

  • CPU overhead < 5%
  • Memory usage < 500MB
  • Storage efficiency > 90%

Business Impact

  • Time saved in monitoring
  • Incident prevention rate
  • Resource optimization impact

Maintenance Requirements

Regular Updates

  • Security patches
  • Feature updates
  • Performance optimizations
  • Documentation updates

Support

  • Issue resolution
  • User assistance
  • Feature requests
  • Bug fixes

Future Considerations

Scalability

  • Cloud deployment options
  • Enterprise features
  • Additional integrations
  • Performance enhancements

Integration

  • CI/CD systems
  • Cloud providers
  • Monitoring platforms
  • Analytics tools

docs/requirements/TECHNICAL_SPEC.md (7.8 KiB)

GPU Sentinel Pro - Technical Specification

System Architecture

Component Overview

graph TD
    A[Frontend React App] --> B[FastAPI Backend]
    B --> C[Supabase Database]
    B --> D[NVIDIA SMI]
    B --> E[Alert System]
    E --> F[Email/Webhook]
    B --> G[Historical Analytics]

Technology Stack

  • Frontend
    • React 18+
    • TypeScript 5+
    • Vite
    • Real-time data visualization

  • Backend
    • FastAPI
    • Python 3.10+
    • NVIDIA SMI integration
    • WebSocket support

  • Database
    • Supabase (PostgreSQL)
    • Time-series optimization
    • Data partitioning

Core Features Implementation

1. Real-Time Monitoring

Data Collection

class GPUMetricsCollector:
    POLLING_INTERVAL = 250  # milliseconds

    async def collect_metrics(self):
        metrics = await nvidia_smi.get_metrics()
        return self.process_metrics(metrics)

Frontend Updates

interface MetricsState {
  current: GPUMetrics;
  history: MetricsHistory;
  alerts: Alert[];
}

const useMetrics = () => {
  const [metrics, setMetrics] = useState<MetricsState>();
  // Polling implementation
};

2. Alert System

Alert Rules Engine

from typing import Callable

class AlertRule:
    def __init__(self, metric: str, threshold: float, condition: Callable):
        self.metric = metric
        self.threshold = threshold
        self.condition = condition

    def evaluate(self, value: float) -> bool:
        return self.condition(value, self.threshold)
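
For illustration, a rule can be built from the standard operator module (usage hypothetical, matching the class above):

import operator

temp_rule = AlertRule("temperature", 80.0, operator.gt)
assert temp_rule.evaluate(85.0) is True   # 85 > 80 triggers the rule
assert temp_rule.evaluate(72.0) is False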

Notification System

class NotificationManager:
    async def send_alert(self, alert: Alert):
        match alert.channel:
            case "email":
                await self.send_email(alert)
            case "webhook":
                await self.send_webhook(alert)

3. Historical Analysis

Data Storage Schema

CREATE TABLE gpu_metrics (
    id BIGSERIAL PRIMARY KEY,
    timestamp TIMESTAMPTZ NOT NULL,
    gpu_id INTEGER NOT NULL,
    metric_type VARCHAR(50) NOT NULL,
    value DOUBLE PRECISION NOT NULL,
    metadata JSONB,
    CONSTRAINT unique_metric 
        UNIQUE (timestamp, gpu_id, metric_type)
);

CREATE INDEX idx_metrics_timestamp 
    ON gpu_metrics (timestamp DESC);

Analytics Queries

from datetime import datetime

class MetricsAnalyzer:
    async def get_trends(
        self,
        start_time: datetime,
        end_time: datetime,
        metric: str
    ) -> list[dict]:  # one record per 5-minute bucket
        query = """
        -- time_bucket() requires the TimescaleDB extension
        SELECT 
            time_bucket('5 minutes', timestamp) AS interval,
            avg(value) as avg_value,
            max(value) as max_value,
            min(value) as min_value
        FROM gpu_metrics
        WHERE 
            timestamp BETWEEN $1 AND $2
            AND metric_type = $3
        GROUP BY interval
        ORDER BY interval;
        """
        return await self.db.fetch_all(query, start_time, end_time, metric)

Performance Optimizations

Backend Optimizations

from cachetools import TTLCache

class MetricsCache:
    def __init__(self):
        self.cache = TTLCache(maxsize=1000, ttl=300)  # entries expire after 5 minutes

    async def get_or_fetch(self, key: str) -> dict:
        if key in self.cache:
            return self.cache[key]
        value = await self.fetch_from_db(key)
        self.cache[key] = value
        return value

Frontend Optimizations

const useMetricsOptimized = () => {
  const queryClient = useQueryClient();

  return useQuery({
    queryKey: ['metrics'],
    queryFn: fetchMetrics,
    staleTime: 250,
    cacheTime: 1000 * 60 * 5,
    refetchInterval: 250
  });
};

Error Handling

Backend Error Handling

from datetime import datetime

from fastapi.responses import JSONResponse

class GPUError(Exception):
    def __init__(self, message: str, error_code: str):
        self.message = message
        self.error_code = error_code
        super().__init__(self.message)

async def handle_gpu_error(error: GPUError):
    logger.error(f"GPU Error: {error.error_code} - {error.message}")
    return JSONResponse(
        status_code=500,
        content={
            "error": error.error_code,
            "message": error.message,
            "timestamp": datetime.utcnow().isoformat()
        }
    )

Frontend Error Boundaries

class MetricsErrorBoundary extends React.Component {
  state = { hasError: false, error: null };

  static getDerivedStateFromError(error: Error) {
    return { hasError: true, error };
  }

  render() {
    if (this.state.hasError) {
      return <ErrorDisplay error={this.state.error} />;
    }
    return this.props.children;
  }
}

Security Measures

API Authentication

from datetime import datetime, timedelta

import jwt  # PyJWT

class SecurityConfig:
    JWT_ALGORITHM = "HS256"
    JWT_EXPIRE_MINUTES = 60

    @staticmethod
    def create_access_token(data: dict) -> str:
        expire = datetime.utcnow() + timedelta(minutes=SecurityConfig.JWT_EXPIRE_MINUTES)
        to_encode = data.copy()
        to_encode.update({"exp": expire})
        return jwt.encode(to_encode, settings.SECRET_KEY, algorithm=SecurityConfig.JWT_ALGORITHM)

Data Validation

class MetricsValidator:
    @staticmethod
    def validate_gpu_metrics(metrics: dict) -> bool:
        required_fields = {'temperature', 'memory_used', 'gpu_utilization'}
        return all(
            isinstance(metrics.get(field), (int, float))
            for field in required_fields
        )
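
Usage, for illustration (sample values only):

assert MetricsValidator.validate_gpu_metrics(
    {"temperature": 71.0, "memory_used": 4096, "gpu_utilization": 88}
) is True

# A missing or non-numeric required field fails validation:
assert MetricsValidator.validate_gpu_metrics({"temperature": "hot"}) is False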

Deployment Configuration

Docker Setup

# Backend
FROM python:3.11-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY . .
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "5500"]

# Frontend
FROM node:20-alpine
WORKDIR /app
COPY package*.json ./
RUN npm install
COPY . .
RUN npm run build

Environment Configuration

# Backend config
SUPABASE_URL: ${SUPABASE_URL}
SUPABASE_KEY: ${SUPABASE_KEY}
LOG_LEVEL: info
METRICS_RETENTION_DAYS: 30
ALERT_COOLDOWN_MINUTES: 5

# Frontend config
VITE_API_URL: ${API_URL}
VITE_WS_URL: ${WS_URL}
VITE_UPDATE_INTERVAL: 250

Monitoring and Logging

Application Logging

import logging.config

class LogConfig:
    @staticmethod
    def setup_logging():
        logging.config.dictConfig({
            'version': 1,
            'disable_existing_loggers': False,
            'formatters': {
                'default': {
                    'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
                }
            },
            'handlers': {
                'console': {
                    'class': 'logging.StreamHandler',
                    'formatter': 'default'
                },
                'file': {
                    'class': 'logging.handlers.RotatingFileHandler',
                    'filename': 'gpu_service.log',
                    'maxBytes': 10485760,  # 10MB
                    'backupCount': 5,
                    'formatter': 'default'
                }
            },
            'root': {
                'level': 'INFO',
                'handlers': ['console', 'file']
            }
        })
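
Called once at service startup; a minimal sketch (the logger name is hypothetical):

import logging

LogConfig.setup_logging()
logger = logging.getLogger("gpu_sentinel")
logger.info("GPU Sentinel service started")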

Testing Strategy

Backend Tests

import pytest

class TestGPUMetrics:
    @pytest.fixture
    def metrics_collector(self):
        return GPUMetricsCollector()

    @pytest.mark.asyncio  # requires pytest-asyncio to run async test functions
    async def test_metrics_collection(self, metrics_collector):
        metrics = await metrics_collector.collect_metrics()
        assert 'temperature' in metrics
        assert isinstance(metrics['temperature'], (int, float))

Frontend Tests

describe('MetricsDisplay', () => {
  it('should update metrics every 250ms', async () => {
    const { result } = renderHook(() => useMetrics());
    await waitFor(() => {
      expect(result.current.data).toBeDefined();
    });
    expect(result.current.data.temperature).toBeGreaterThanOrEqual(0);
  });
});

Performance Benchmarks

Target Metrics

  • API Response Time: < 100ms (95th percentile)
  • Frontend Render Time: < 50ms
  • Database Query Time: < 50ms
  • Memory Usage: < 500MB
  • CPU Usage: < 5% per core

frontend/Dockerfile (324 B)

FROM node:20-slim

WORKDIR /app

# Copy package files
COPY package*.json ./

# Install dependencies
RUN npm install

# Copy source code
COPY . .

# Build the application
RUN npm run build

# Install serve to run the built app
RUN npm install -g serve

# Serve the built application
CMD ["serve", "-s", "dist", "-l", "5173"]

frontend/README.md (557 B)

GPU Metrics Dashboard Frontend

React-based dashboard for monitoring NVIDIA GPUs.

Setup

  1. Install dependencies:
npm install
  2. Start the development server:
npm run dev

The dashboard will be available at http://localhost:5501

Configuration

The dashboard connects to the backend service at http://localhost:5500 by default. To change this, edit the API_URL in src/config.ts.

Features

  • Real-time GPU metrics display
  • Temperature, utilization, and memory monitoring
  • Historical data viewing
  • Alert notifications

frontend/eslint.config.js (734 B)

import js from '@eslint/js'
import globals from 'globals'
import reactHooks from 'eslint-plugin-react-hooks'
import reactRefresh from 'eslint-plugin-react-refresh'
import tseslint from 'typescript-eslint'

export default tseslint.config(
  { ignores: ['dist'] },
  {
    extends: [js.configs.recommended, ...tseslint.configs.recommended],
    files: ['**/*.{ts,tsx}'],
    languageOptions: {
      ecmaVersion: 2020,
      globals: globals.browser,
    },
    plugins: {
      'react-hooks': reactHooks,
      'react-refresh': reactRefresh,
    },
    rules: {
      ...reactHooks.configs.recommended.rules,
      'react-refresh/only-export-components': [
        'warn',
        { allowConstantExport: true },
      ],
    },
  },
)

frontend/frontend.log (158 B)

> gpu-metrics-dashboard@1.0.0 dev
> vite


  VITE v4.5.5  ready in 235 ms

  โžœ  Local:   http://localhost:3055/
  โžœ  Network: http://192.168.0.224:3055/

frontend/frontend.pid (7 B)

359049

frontend/index.html (366 B)

<!doctype html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <link rel="icon" type="image/svg+xml" href="/vite.svg" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Vite + React + TS</title>
  </head>
  <body>
    <div id="root"></div>
    <script type="module" src="/src/main.tsx"></script>
  </body>
</html>

frontend/package.json (1.1 KiB)

{
  "name": "gpu-metrics-dashboard",
  "private": true,
  "version": "1.0.0",
  "type": "module",
  "scripts": {
    "dev": "vite",
    "build": "tsc && vite build",
    "lint": "eslint . --ext ts,tsx --report-unused-disable-directives --max-warnings 0",
    "preview": "vite preview"
  },
  "dependencies": {
    "@emotion/react": "^11.14.0",
    "@emotion/styled": "^11.14.0",
    "@mui/icons-material": "^5.16.14",
    "@mui/material": "^5.16.14",
    "axios": "^1.6.7",
    "date-fns": "^2.30.0",
    "react": "^18.2.0",
    "react-dom": "^18.2.0",
    "recharts": "^2.15.1",
    "zustand": "^4.4.7"
  },
  "devDependencies": {
    "@mui/types": "^7.2.21",
    "@tsconfig/node18": "^18.2.4",
    "@types/date-fns": "^2.5.3",
    "@types/react": "^18.2.15",
    "@types/react-dom": "^18.2.7",
    "@types/recharts": "^1.8.29",
    "@typescript-eslint/eslint-plugin": "^6.0.0",
    "@typescript-eslint/parser": "^6.0.0",
    "@vitejs/plugin-react": "^4.0.3",
    "eslint": "^8.45.0",
    "eslint-plugin-react-hooks": "^4.6.0",
    "eslint-plugin-react-refresh": "^0.4.3",
    "typescript": "^5.0.2",
    "vite": "^5.1.1"
  }
}

frontend/run_frontend.sh (811 B)

#!/bin/bash

# Directory of this script
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
FRONTEND_PORT=3055
FRONTEND_LOG="frontend.log"
PID_FILE="frontend.pid"

cd "$DIR"

# Check if port is in use
if [[ $(lsof -i :${FRONTEND_PORT} | grep LISTEN) ]]; then
    echo "Port ${FRONTEND_PORT} is in use. Current processes:"
    lsof -i :${FRONTEND_PORT}
    read -p "Kill these processes? (y/N) " response
    if [[ "$response" =~ ^[Yy]$ ]]; then
        kill $(lsof -t -i :${FRONTEND_PORT})
        sleep 2
    else
        echo "Startup aborted"
        exit 1
    fi
fi

# Start the frontend service
nohup npm run dev > "${FRONTEND_LOG}" 2>&1 &
FRONTEND_PID=$!

# Store the PID
echo $FRONTEND_PID > "${PID_FILE}"
echo "Frontend started with PID ${FRONTEND_PID}"
echo "Logs available at $DIR/${FRONTEND_LOG}"

frontend/src/App.tsx (32.1 KiB)

import { useState, useEffect } from 'react'

const API_URL = 'http://localhost:5183'

/**
 * Represents comprehensive information about a single NVIDIA GPU
 * @interface GPUInfo
 */
interface GPUInfo {
  /** Unique index of the GPU in the system */
  index: number
  /** Full name/model of the GPU */
  name: string
  /** Current fan speed as a percentage (0-100) */
  fan_speed: number
  /** Current power consumption in watts */
  power_draw: number
  /** Maximum power limit in watts */
  power_limit: number
  /** Total GPU memory in megabytes */
  memory_total: number
  /** Currently used GPU memory in megabytes */
  memory_used: number
  /** Current GPU utilization as a percentage (0-100) */
  gpu_utilization: number
  /** Current GPU temperature in Celsius */
  temperature: number
  /** Highest recorded temperature in Celsius since last reset */
  peak_temperature: number
  /** Rate of temperature change in degrees Celsius per second */
  temp_change_rate: number
  /** Current compute mode of the GPU (e.g., 'Default', 'Exclusive Process') */
  compute_mode: string
}

/**
 * Metrics related to GPU stress testing/burn-in operations
 * @interface GPUBurnMetrics
 */
interface GPUBurnMetrics {
  /** Indicates if a GPU stress test is currently running */
  running: boolean
  /** Duration of the current/last stress test in seconds */
  duration: number
  /** Number of errors encountered during stress testing */
  errors: number
}

/**
 * Comprehensive GPU system information including all GPUs and system-wide metrics
 * @interface GPUData
 */
interface GPUData {
  /** Array of information for each GPU in the system */
  gpus: GPUInfo[]
  /** System-wide NVIDIA driver information */
  nvidia_info: {
    /** Installed NVIDIA driver version */
    driver_version: string
    /** Installed CUDA version */
    cuda_version: string
  }
  /** List of processes currently using GPU resources */
  processes: any[]
  /** Metrics from GPU stress testing */
  gpu_burn_metrics: GPUBurnMetrics
  /** Indicates if the data was retrieved successfully */
  success: boolean
}

/**
 * Theme configuration for the application's color scheme
 * @interface ThemeColors
 */
interface ThemeColors {
  /** Main background color of the application */
  background: string
  /** Background color for GPU info cards */
  cardBackground: string
  /** Primary text color */
  text: string
  /** Secondary/supplementary text color */
  subtext: string
  /** Border color for UI elements */
  border: string
  /** Background color for progress bar tracks */
  progressBackground: string
  /** Indicates if dark theme is active */
  isDark: boolean
}

const getColorScheme = (isDark: boolean) => ({
  critical: {
    light: '#DC2626', // deep red (visible on white)
    dark: '#FF6B6B'   // lighter red (visible on dark)
  },
  warning: {
    light: '#EA580C', // deep orange
    dark: '#FFA94D'   // lighter orange
  },
  caution: {
    light: '#CA8A04', // deep yellow-orange
    dark: '#FFD43B'   // lighter yellow
  },
  good: {
    light: '#16A34A', // deep green
    dark: '#51CF66'   // lighter green
  },
  ideal: {
    light: '#2563EB', // deep blue
    dark: '#339AF0'   // lighter blue
  }
});

const POLLING_INTERVALS = [
  { label: '250ms', value: 250 },
  { label: '500ms', value: 500 },
  { label: '1 second', value: 1000 },
  { label: '2 seconds', value: 2000 },
  { label: '5 seconds', value: 5000 },
  { label: '10 seconds', value: 10000 }
]

function App() {
  const [data, setData] = useState<GPUData | null>(null)
  const [error, setError] = useState<string | null>(null)
  const [darkMode, setDarkMode] = useState(() => {
    const saved = localStorage.getItem('darkMode')
    return saved ? JSON.parse(saved) : true
  })
  const [pollingInterval, setPollingInterval] = useState(() => {
    const saved = localStorage.getItem('pollingInterval')
    return saved ? parseInt(saved) : 1000
  })
  const [loggingEnabled, setLoggingEnabled] = useState(true)

  useEffect(() => {
    const fetchLoggingStatus = async () => {
      try {
        const response = await fetch(`${API_URL}/api/logging/status`)
        if (!response.ok) throw new Error('Failed to fetch logging status')
        const data = await response.json()
        setLoggingEnabled(data.logging_enabled)
      } catch (error) {
        console.error('Error fetching logging status:', error)
      }
    }
    fetchLoggingStatus()
  }, [])

  const toggleLogging = async () => {
    try {
      const response = await fetch(`${API_URL}/api/logging/toggle`, {
        method: 'POST'
      })
      if (!response.ok) throw new Error('Failed to toggle logging')
      const data = await response.json()
      setLoggingEnabled(data.logging_enabled)
    } catch (error) {
      console.error('Error toggling logging:', error)
    }
  }

  const theme: ThemeColors = darkMode ? {
    background: '#1a1a1a',
    cardBackground: '#2d2d2d',
    text: '#e1e1e1',
    subtext: '#a0a0a0',
    border: '#404040',
    progressBackground: '#404040',
    isDark: true
  } : {
    background: '#f8f9fa',
    cardBackground: '#ffffff',
    text: '#2c3e50',
    subtext: '#666666',
    border: '#e1e4e8',
    progressBackground: '#e9ecef',
    isDark: false
  }

  useEffect(() => {
    localStorage.setItem('darkMode', JSON.stringify(darkMode))
  }, [darkMode])

  useEffect(() => {
    console.log('Polling interval set to:', pollingInterval);
    localStorage.setItem('pollingInterval', pollingInterval.toString())
  }, [pollingInterval])

  useEffect(() => {
    const fetchData = async () => {
      const startTime = Date.now();
      console.log('Starting fetch at:', new Date(startTime).toLocaleTimeString());
      try {
        console.log('Fetching from:', `${API_URL}/api/gpu-stats`);
        const response = await fetch(`${API_URL}/api/gpu-stats`)
        if (!response.ok) {
          console.error('Response not OK:', response.status, response.statusText);
          const errorData = await response.json()
          throw new Error(errorData.detail || `HTTP error! Status: ${response.status}`);
        }
        const jsonData = await response.json()
        console.log('Response data:', jsonData);
        setData(jsonData)
        setError(null)
      } catch (error) {
        console.error('Fetch error:', error);
        setError(error instanceof Error ? error.message : 'Failed to fetch GPU data')
      }
    }

    console.log('Setting up data fetch with interval:', pollingInterval);
    fetchData()
    const interval = setInterval(fetchData, pollingInterval)
    return () => clearInterval(interval)
  }, [pollingInterval])

  useEffect(() => {
    const pulseAnimation = `
      @keyframes pulse {
        0% { opacity: 1; }
        50% { opacity: 0.5; }
        100% { opacity: 1; }
      }
    `

    const styleSheet = document.createElement('style')
    styleSheet.textContent = pulseAnimation
    document.head.appendChild(styleSheet)

    return () => {
      document.head.removeChild(styleSheet)
    }
  }, [])

  if (error) {
    return (
      <div style={{ 
        padding: '20px', 
        maxWidth: '800px', 
        margin: '40px auto',
        backgroundColor: theme.cardBackground,
        color: theme.text,
        borderRadius: '12px',
        border: `1px solid ${theme.border}`,
        boxShadow: '0 4px 6px rgba(0,0,0,0.1)'
      }}>
        <div style={{
          display: 'flex',
          alignItems: 'center',
          gap: '16px',
          marginBottom: '24px'
        }}>
          <svg width="32" height="32" viewBox="0 0 24 24" fill={getColorScheme(theme.isDark).warning[theme.isDark ? 'dark' : 'light']}>
            <path d="M12 2C6.48 2 2 6.48 2 12s4.48 10 10 10 10-4.48 10-10S17.52 2 12 2zm1 15h-2v-2h2v2zm0-4h-2V7h2v6z"/>
          </svg>
          <h2 style={{ margin: 0, color: theme.text }}>GPU Monitoring Unavailable</h2>
        </div>
        
        <div style={{
          backgroundColor: theme.background,
          padding: '16px',
          borderRadius: '8px',
          marginBottom: '24px',
          fontFamily: 'monospace',
          fontSize: '14px',
          color: theme.subtext
        }}>
          {error}
        </div>

        <div style={{
          borderTop: `1px solid ${theme.border}`,
          paddingTop: '20px'
        }}>
          <h3 style={{ marginTop: 0, color: theme.text }}>Troubleshooting Steps:</h3>
          <ul style={{ color: theme.text, lineHeight: 1.6 }}>
            <li>Verify NVIDIA drivers are installed: <code style={{ backgroundColor: theme.background, padding: '2px 6px', borderRadius: '4px' }}>nvidia-smi</code></li>
            <li>Check GPU connection and power supply</li>
            <li>Ensure CUDA toolkit is properly installed</li>
            <li>Verify user permissions for GPU access</li>
            <li>Check system logs for driver errors</li>
          </ul>
          <button
            onClick={() => window.location.reload()}
            style={{
              padding: '8px 16px',
              backgroundColor: getColorScheme(theme.isDark).good[theme.isDark ? 'dark' : 'light'],
              color: '#fff',
              border: 'none',
              borderRadius: '6px',
              cursor: 'pointer',
              display: 'flex',
              alignItems: 'center',
              gap: '8px',
              marginTop: '16px'
            }}
          >
            <svg width="16" height="16" viewBox="0 0 24 24" fill="currentColor">
              <path d="M17.65 6.35C16.2 4.9 14.21 4 12 4c-4.42 0-7.99 3.58-7.99 8s3.57 8 7.99 8c3.73 0 6.84-2.55 7.73-6h-2.08c-.82 2.33-3.04 4-5.65 4-3.31 0-6-2.69-6-6s2.69-6 6-6c1.66 0 3.14.69 4.22 1.78L13 11h7V4l-2.35 2.35z"/>
            </svg>
            Retry Connection
          </button>
        </div>
      </div>
    )
  }

  if (!data) {
    return (
      <div style={{ 
        padding: '20px', 
        maxWidth: '800px', 
        margin: '40px auto',
        textAlign: 'center',
        color: theme.text
      }}>
        <div style={{
          display: 'inline-block',
          width: '40px',
          height: '40px',
          border: `4px solid ${theme.border}`,
          borderTopColor: getColorScheme(theme.isDark).good[theme.isDark ? 'dark' : 'light'],
          borderRadius: '50%',
          animation: 'spin 1s linear infinite'
        }} />
        <style>
          {`
            @keyframes spin {
              to { transform: rotate(360deg); }
            }
          `}
        </style>
        <p style={{ marginTop: '16px' }}>Connecting to GPU Monitoring Service...</p>
      </div>
    )
  }

  const getMetricColor = (value: number, theme: ThemeColors): string => {
    const colors = getColorScheme(theme.isDark);
    if (value >= 90) return theme.isDark ? colors.critical.dark : colors.critical.light;
    if (value >= 75) return theme.isDark ? colors.warning.dark : colors.warning.light;
    if (value >= 50) return theme.isDark ? colors.caution.dark : colors.caution.light;
    if (value >= 25) return theme.isDark ? colors.good.dark : colors.good.light;
    return theme.isDark ? colors.ideal.dark : colors.ideal.light;
  }

  const getTemperatureColor = (temp: number): string => {
    const colors = getColorScheme(theme.isDark);
    if (temp >= 80) return theme.isDark ? colors.critical.dark : colors.critical.light;
    if (temp >= 70) return theme.isDark ? colors.warning.dark : colors.warning.light;
    if (temp >= 60) return theme.isDark ? colors.caution.dark : colors.caution.light;
    if (temp >= 50) return theme.isDark ? colors.good.dark : colors.good.light;
    return theme.isDark ? colors.ideal.dark : colors.ideal.light;
  }

  const getUtilizationColor = (utilization: number, theme: ThemeColors): string => {
    return getMetricColor(utilization, theme);
  }

  const getFanSpeedColor = (speed: number, theme: ThemeColors): string => {
    const colors = getColorScheme(theme.isDark);
    if (speed > 80) return theme.isDark ? colors.critical.dark : colors.critical.light;
    if (speed > 65) return theme.isDark ? colors.warning.dark : colors.warning.light;
    if (speed > 50) return theme.isDark ? colors.caution.dark : colors.caution.light;
    if (speed > 35) return theme.isDark ? colors.good.dark : colors.good.light;
    return theme.isDark ? colors.ideal.dark : colors.ideal.light;
  }

  const getTemperatureIcon = (rate: number): { icon: string; color: string } => {
    const colors = getColorScheme(theme.isDark);
    if (Math.abs(rate) < 1.0) return { icon: '', color: theme.text };
    return rate > 0 
      ? { icon: 'โŒƒ', color: theme.isDark ? colors.critical.dark : colors.critical.light }  // Rising temp
      : { icon: 'โŒ„', color: theme.isDark ? colors.good.dark : colors.good.light };         // Falling temp
  }

  return (
    <div style={{ 
      padding: '20px', 
      maxWidth: '1200px', 
      margin: '0 auto', 
      fontFamily: 'system-ui, -apple-system, sans-serif',
      backgroundColor: theme.background,
      color: theme.text,
      minHeight: '100vh'
    }}>
      <div style={{ 
        display: 'flex', 
        justifyContent: 'space-between', 
        alignItems: 'center', 
        marginBottom: '20px',
        flexWrap: 'wrap',
        gap: '10px'
      }}>
        <div style={{ display: 'flex', alignItems: 'center', gap: '15px', flexWrap: 'wrap' }}>
          <h1 style={{ margin: 0, color: theme.text }}>
            NVIDIA-SMI {data.nvidia_info.driver_version}
          </h1>
          <div style={{ 
            display: 'flex', 
            gap: '20px',
            color: theme.subtext,
            fontSize: '1rem',
            fontFamily: 'monospace',
            flexWrap: 'wrap',
            alignItems: 'center'
          }}>
            <span style={{ 
              padding: '2px 6px',
              backgroundColor: theme.cardBackground,
              border: `1px solid ${theme.border}`,
              borderRadius: '4px',
              fontSize: '0.9em',
              display: 'flex',
              gap: '6px'
            }}>
              <span style={{ color: theme.subtext }}>Driver:</span>
              {data.nvidia_info.driver_version}
            </span>
            <span style={{ 
              padding: '2px 6px',
              backgroundColor: theme.cardBackground,
              border: `1px solid ${theme.border}`,
              borderRadius: '4px',
              fontSize: '0.9em',
              display: 'flex',
              gap: '6px'
            }}>
              <span style={{ color: theme.subtext }}>CUDA:</span>
              {data.nvidia_info.cuda_version}
            </span>
          </div>
        </div>
        <div style={{ display: 'flex', gap: '10px', alignItems: 'center' }}>
          <select
            value={pollingInterval}
            onChange={(e) => setPollingInterval(parseInt(e.target.value))}
            style={{
              padding: '8px 12px',
              borderRadius: '8px',
              border: `1px solid ${theme.border}`,
              backgroundColor: theme.cardBackground,
              color: theme.text,
              cursor: 'pointer',
              fontSize: '14px'
            }}
          >
            {POLLING_INTERVALS.map(interval => (
              <option key={interval.value} value={interval.value}>
                Update every {interval.label}
              </option>
            ))}
          </select>
          <button
            onClick={() => setDarkMode(!darkMode)}
            style={{
              padding: '8px 12px',
              borderRadius: '8px',
              border: `1px solid ${theme.border}`,
              backgroundColor: theme.cardBackground,
              color: theme.text,
              cursor: 'pointer',
              display: 'flex',
              alignItems: 'center',
              gap: '5px'
            }}
          >
            {darkMode ? (
              <svg width="16" height="16" viewBox="0 0 24 24" fill="currentColor">
                <path d="M12 7c-2.76 0-5 2.24-5 5s2.24 5 5 5 5-2.24 5-5-2.24-5-5-5zM2 13h2c.55 0 1-.45 1-1s-.45-1-1-1H2c-.55 0-1 .45-1 1s.45 1 1 1zm18 0h2c.55 0 1-.45 1-1s-.45-1-1-1h-2c-.55 0-1 .45-1 1s.45 1 1 1zM11 2v2c0 .55.45 1 1 1s1-.45 1-1V2c0-.55-.45-1-1-1s-1 .45-1 1zm0 18v2c0 .55.45 1 1 1s1-.45 1-1v-2c0-.55-.45-1-1-1s-1 .45-1 1zM5.99 4.58c-.39-.39-1.03-.39-1.41 0-.39.39-.39 1.03 0 1.41l1.06 1.06c.39.39 1.03.39 1.41 0s.39-1.03 0-1.41L5.99 4.58zm12.37 12.37c-.39-.39-1.03-.39-1.41 0-.39.39-.39 1.03 0 1.41l1.06 1.06c.39.39 1.03.39 1.41 0 .39-.39.39-1.03 0-1.41l-1.06-1.06zm1.06-10.96c.39-.39.39-1.03 0-1.41-.39-.39-1.03-.39-1.41 0l-1.06 1.06c-.39.39-.39 1.03 0 1.41s1.03.39 1.41 0l1.06-1.06zM7.05 18.36c.39-.39.39-1.03 0-1.41-.39-.39-1.03-.39-1.41 0l-1.06 1.06c-.39.39-.39 1.03 0 1.41s1.03.39 1.41 0l1.06-1.06z"/>
              </svg>
            ) : (
              <svg width="16" height="16" viewBox="0 0 24 24" fill="currentColor">
                <path d="M12 3c-4.97 0-9 4.03-9 9s4.03 9 9 9 9-4.03 9-9c0-.46-.04-.92-.1-1.36-.98 1.37-2.58 2.26-4.4 2.26-3.03 0-5.5-2.47-5.5-5.5 0-1.82.89-3.42 2.26-4.4-.44-.06-.9-.1-1.36-.1z"/>
              </svg>
            )}
            {darkMode ? 'Light Mode' : 'Dark Mode'}
          </button>
          <button
            onClick={toggleLogging}
            style={{
              padding: '8px 12px',
              borderRadius: '8px',
              border: `1px solid ${theme.border}`,
              backgroundColor: theme.cardBackground,
              color: loggingEnabled ? getMetricColor(90, theme) : theme.subtext,
              cursor: 'pointer',
              display: 'flex',
              alignItems: 'center',
              gap: '5px'
            }}
          >
            <svg width="16" height="16" viewBox="0 0 24 24" fill="currentColor">
              <path d="M14 12c0-1.1-.9-2-2-2s-2 .9-2 2 .9 2 2 2 2-.9 2-2zm-2-9c-4.97 0-9 4.03-9 9H0l4 4 4-4H5c0-3.87 3.13-7 7-7s7 3.13 7 7-3.13 7-7 7c-1.51 0-2.91-.49-4.06-1.3l-1.42 1.44C8.04 20.3 9.94 21 12 21c4.97 0 9-4.03 9-9s-4.03-9-9-9z"/>
            </svg>
            {loggingEnabled ? 'Pause Logging' : 'Resume Logging'}
          </button>
        </div>
      </div>
      
      {/* GPU Cards */}
      {data.gpus.map(gpu => {
        const memoryPercentage = (gpu.memory_used / gpu.memory_total) * 100
        const powerPercentage = Math.min((gpu.power_draw / (gpu.power_limit || 250)) * 100, 100) // use the reported limit, falling back to 250W

        return (
          <div key={gpu.index} style={{ 
            border: `1px solid ${theme.border}`,
            padding: '20px',
            margin: '20px 0',
            borderRadius: '8px',
            backgroundColor: theme.cardBackground,
            boxShadow: '0 2px 4px rgba(0,0,0,0.1)'
          }}>
            <div style={{ 
              display: 'flex', 
              alignItems: 'center', 
              gap: '10px',
              marginBottom: '20px'
            }}>
              <h2 style={{ margin: 0, color: theme.text }}>{gpu.name}</h2>
              <div style={{ display: 'flex', gap: '10px', alignItems: 'center' }}>
                <span style={{ 
                  fontSize: '0.9rem', 
                  padding: '3px 8px', 
                  backgroundColor: theme.cardBackground,
                  border: `1px solid ${theme.border}`,
                  borderRadius: '4px',
                  color: theme.subtext
                }}>
                  GPU #{gpu.index}
                </span>
                <span style={{ 
                  fontSize: '0.9rem', 
                  padding: '3px 8px', 
                  backgroundColor: theme.cardBackground,
                  border: `1px solid ${theme.border}`,
                  borderRadius: '4px',
                  color: theme.subtext
                }}>
                  {(gpu.memory_total / 1024).toFixed(1)} GB
                </span>
              </div>
            </div>
            
            <div style={{ display: 'grid', gridTemplateColumns: 'repeat(auto-fit, minmax(200px, 1fr))', gap: '20px' }}>
              {/* GPU Stats */}
              <div style={{ display: 'flex', gap: '16px', flexWrap: 'wrap' }}>
                <div style={{ flex: '1', minWidth: '150px' }}>
                  <div style={{ marginBottom: '8px', color: theme.subtext }}>GPU Utilization</div>
                  <div 
                    role="progressbar" 
                    aria-valuenow={gpu.gpu_utilization}
                    aria-valuemin={0}
                    aria-valuemax={100}
                    aria-label={`GPU ${gpu.index} utilization: ${gpu.gpu_utilization}%`}
                    style={{ 
                      height: '24px', 
                      backgroundColor: theme.progressBackground,
                      borderRadius: '12px',
                      overflow: 'hidden',
                      position: 'relative'
                    }}
                  >
                    <div style={{
                      width: `${gpu.gpu_utilization}%`,
                      height: '100%',
                      backgroundColor: getUtilizationColor(gpu.gpu_utilization, theme),
                      transition: 'all 0.3s ease-in-out',
                      position: 'relative'
                    }}>
                      <div style={{
                        position: 'absolute',
                        top: 0,
                        left: 0,
                        right: 0,
                        bottom: 0,
                        background: 'linear-gradient(90deg, rgba(255,255,255,0.1) 0%, rgba(255,255,255,0.2) 100%)',
                        opacity: theme.isDark ? 0.1 : 0.2
                      }} />
                    </div>
                  </div>
                  <div style={{ 
                    marginTop: '4px', 
                    textAlign: 'right',
                    color: getUtilizationColor(gpu.gpu_utilization, theme)
                  }}>{gpu.gpu_utilization}%</div>
                </div>

                <div style={{ flex: '1', minWidth: '150px' }}>
                  <div style={{ marginBottom: '8px', color: theme.subtext }}>Memory Usage</div>
                  <div 
                    role="progressbar"
                    aria-valuenow={(gpu.memory_used / gpu.memory_total) * 100}
                    aria-valuemin={0}
                    aria-valuemax={100}
                    aria-label={`GPU ${gpu.index} memory usage: ${(gpu.memory_used / gpu.memory_total * 100).toFixed(1)}%`}
                    style={{ 
                      height: '24px', 
                      backgroundColor: theme.progressBackground,
                      borderRadius: '12px',
                      overflow: 'hidden',
                      position: 'relative'
                    }}
                  >
                    <div style={{
                      width: `${(gpu.memory_used / gpu.memory_total) * 100}%`,
                      height: '100%',
                      backgroundColor: getUtilizationColor((gpu.memory_used / gpu.memory_total) * 100, theme),
                      transition: 'all 0.3s ease-in-out',
                      position: 'relative'
                    }}>
                      <div style={{
                        position: 'absolute',
                        top: 0,
                        left: 0,
                        right: 0,
                        bottom: 0,
                        background: 'linear-gradient(90deg, rgba(255,255,255,0.1) 0%, rgba(255,255,255,0.2) 100%)',
                        opacity: theme.isDark ? 0.1 : 0.2
                      }} />
                    </div>
                  </div>
                  <div style={{ 
                    marginTop: '4px', 
                    textAlign: 'right',
                    display: 'flex',
                    justifyContent: 'flex-end',
                    alignItems: 'center',
                    gap: '4px'
                  }}>
                    <span style={{
                      color: getUtilizationColor((gpu.memory_used / gpu.memory_total) * 100, theme)
                    }}>
                      {(gpu.memory_used / 1024).toFixed(1)}GB
                    </span>
                    <span style={{
                      color: theme.subtext
                    }}>
                      / {(gpu.memory_total / 1024).toFixed(1)}GB
                    </span>
                  </div>
                </div>

                <div style={{ flex: '1', minWidth: '150px' }}>
                  <div style={{ marginBottom: '8px', color: theme.subtext }}>Temperature</div>
                  <div 
                    role="progressbar"
                    aria-valuenow={gpu.temperature}
                    aria-valuemin={0}
                    aria-valuemax={100}
                    aria-label={`GPU ${gpu.index} temperature: ${gpu.temperature}ยฐC, ${Math.round(gpu.temperature * 9/5 + 32)}ยฐF`}
                    style={{ 
                      height: '24px', 
                      backgroundColor: theme.progressBackground,
                      borderRadius: '12px',
                      overflow: 'hidden',
                      position: 'relative'
                    }}
                  >
                    <div style={{
                      width: `${Math.min((gpu.temperature / 100) * 100, 100)}%`,
                      height: '100%',
                      backgroundColor: getTemperatureColor(gpu.temperature),
                      transition: 'all 0.3s ease-in-out',
                      position: 'relative'
                    }}>
                      <div style={{
                        position: 'absolute',
                        top: 0,
                        left: 0,
                        right: 0,
                        bottom: 0,
                        background: 'linear-gradient(90deg, rgba(255,255,255,0.1) 0%, rgba(255,255,255,0.2) 100%)',
                        opacity: theme.isDark ? 0.1 : 0.2
                      }} />
                    </div>
                  </div>
                  <div style={{ 
                    marginTop: '4px', 
                    textAlign: 'right',
                    display: 'flex',
                    justifyContent: 'flex-end',
                    alignItems: 'center',
                    gap: '8px'
                  }}>
                    <span style={{ 
                      color: getTemperatureColor(gpu.temperature),
                      fontSize: '1.1em',
                      fontWeight: 500,
                      display: 'flex',
                      alignItems: 'center',
                      gap: '4px'
                    }}>
                      {Math.abs(gpu.temp_change_rate) >= 1.0 && (
                        <span style={{ 
                          color: getTemperatureIcon(gpu.temp_change_rate).color,
                          fontSize: '1.5em',
                          fontWeight: 'bold',
                          display: 'flex',
                          alignItems: 'center',
                          marginRight: '2px'
                        }}>
                          {getTemperatureIcon(gpu.temp_change_rate).icon}
                        </span>
                      )}
                      {Math.round(gpu.temperature)}ยฐC
                    </span>
                    <span style={{ 
                      color: theme.subtext,
                      fontSize: '1.0em'
                    }}>
                      / {Math.round(gpu.temperature * 9/5 + 32)}ยฐF
                    </span>
                    <div style={{ 
                      fontSize: '1.0rem', 
                      color: theme.subtext 
                    }}>
                      Peak: {gpu.peak_temperature}ยฐC
                    </div>
                  </div>
                </div>

                <div style={{ flex: '1', minWidth: '150px' }}>
                  <div style={{ marginBottom: '8px', color: theme.subtext }}>Fan Speed</div>
                  <div 
                    role="progressbar"
                    aria-valuenow={gpu.fan_speed}
                    aria-valuemin={0}
                    aria-valuemax={100}
                    aria-label={`GPU ${gpu.index} fan speed: ${gpu.fan_speed}%`}
                    style={{ 
                      height: '24px', 
                      backgroundColor: theme.progressBackground,
                      borderRadius: '12px',
                      overflow: 'hidden',
                      position: 'relative'
                    }}
                  >
                    <div style={{
                      width: `${gpu.fan_speed}%`,
                      height: '100%',
                      backgroundColor: getFanSpeedColor(gpu.fan_speed, theme),
                      transition: 'all 0.3s ease-in-out',
                      position: 'relative'
                    }}>
                      <div style={{
                        position: 'absolute',
                        top: 0,
                        left: 0,
                        right: 0,
                        bottom: 0,
                        background: 'linear-gradient(90deg, rgba(255,255,255,0.1) 0%, rgba(255,255,255,0.2) 100%)',
                        opacity: theme.isDark ? 0.1 : 0.2
                      }} />
                    </div>
                  </div>
                  <div style={{ 
                    marginTop: '4px', 
                    textAlign: 'right',
                    color: getFanSpeedColor(gpu.fan_speed, theme)
                  }}>{gpu.fan_speed}%</div>
                </div>
              </div>
            </div>
            
            {/* GPU Burn Status */}
            {data.gpu_burn_metrics.running && (
              <div style={{
                marginTop: '20px',
                padding: '15px',
                backgroundColor: theme.cardBackground,
                borderRadius: '8px',
                border: `1px solid ${theme.border}`
              }}>
                <div style={{
                  display: 'flex',
                  justifyContent: 'space-between',
                  alignItems: 'center',
                  marginBottom: '10px'
                }}>
                  <div style={{
                    display: 'flex',
                    alignItems: 'center',
                    gap: '10px'
                  }}>
                    <span style={{
                      width: '10px',
                      height: '10px',
                      borderRadius: '50%',
                      backgroundColor: '#ff4444',
                      animation: 'pulse 2s infinite'
                    }} />
                    <span style={{ fontWeight: 500 }}>GPU Burn Test Running</span>
                  </div>
                  <div style={{ color: theme.subtext }}>
                    Duration: {Math.floor(data.gpu_burn_metrics.duration / 60)}m {Math.round(data.gpu_burn_metrics.duration % 60)}s
                  </div>
                </div>
                {data.gpu_burn_metrics.errors > 0 && (
                  <div style={{ color: '#ff4444', marginTop: '10px' }}>
                    โš ๏ธ {data.gpu_burn_metrics.errors} computation errors detected
                  </div>
                )}
              </div>
            )}
          </div>
        )
      })}

      {/* Running Processes */}
      {data.processes.length > 0 && (
        <div style={{ 
          border: `1px solid ${theme.border}`,
          padding: '20px',
          marginTop: '20px',
          borderRadius: '8px',
          backgroundColor: theme.cardBackground,
          boxShadow: '0 1px 3px rgba(0,0,0,0.1)'
        }}>
          <h3 style={{ marginTop: 0, color: theme.text }}>Running Processes</h3>
          <div style={{ 
            display: 'grid', 
            gridTemplateColumns: 'repeat(auto-fit, minmax(300px, 1fr))', 
            gap: '10px' 
          }}>
            {data.processes
              .filter(proc => proc.name.toLowerCase() !== 'unknown')
              .map((proc, idx) => (
                <div key={`${proc.pid}-${idx}`} style={{ 
                  padding: '10px',
                  border: `1px solid ${theme.border}`,
                  borderRadius: '8px',
                  backgroundColor: theme.cardBackground
                }}>
                  <div style={{ 
                    display: 'flex', 
                    justifyContent: 'space-between',
                    alignItems: 'center'
                  }}>
                    <div>
                      <div style={{ fontWeight: '600', color: theme.text }}>{proc.name}</div>
                      <div style={{ fontSize: '12px', color: theme.subtext }}>PID: {proc.pid}</div>
                    </div>
                    <div style={{ color: theme.text }}>{proc.used_memory} MB</div>
                  </div>
                </div>
              ))}
          </div>
        </div>
      )}
    </div>
  )
}

export default App

frontend/src/components/AlertsPanel.tsx (3.3 KiB)

import React from 'react';
import {
  Paper,
  List,
  ListItem,
  ListItemText,
  Typography,
  Box,
  Chip,
  IconButton,
} from '@mui/material';
import {
  Warning as WarningIcon,
  Error as ErrorIcon,
  CheckCircle as CheckCircleIcon,
  Delete as DeleteIcon,
} from '@mui/icons-material';
import { format } from 'date-fns';

interface Alert {
  id: string;
  severity: 'warning' | 'critical' | 'resolved';
  message: string;
  timestamp: string;
  gpuIndex: number;
  metricName: string;
  value: number;
  threshold: number;
}

interface AlertsPanelProps {
  alerts: Alert[];
  onDismiss?: (alertId: string) => void;
}

const AlertsPanel: React.FC<AlertsPanelProps> = ({ alerts, onDismiss }) => {
  const getSeverityIcon = (severity: Alert['severity']) => {
    switch (severity) {
      case 'warning':
        return <WarningIcon sx={{ color: 'warning.main' }} />;
      case 'critical':
        return <ErrorIcon sx={{ color: 'error.main' }} />;
      case 'resolved':
        return <CheckCircleIcon sx={{ color: 'success.main' }} />;
    }
  };

  const getSeverityColor = (severity: Alert['severity']) => {
    switch (severity) {
      case 'warning':
        return 'warning';
      case 'critical':
        return 'error';
      case 'resolved':
        return 'success';
    }
  };

  return (
    <Paper sx={{ maxHeight: 400, overflow: 'auto', p: 2 }}>
      <Typography variant="h6" gutterBottom>
        Alerts
      </Typography>
      <List>
        {alerts.map((alert) => (
          <ListItem
            key={alert.id}
            sx={{
              mb: 1,
              border: 1,
              borderColor: 'divider',
              borderRadius: 1,
            }}
            secondaryAction={
              onDismiss && (
                <IconButton
                  edge="end"
                  aria-label="dismiss"
                  onClick={() => onDismiss(alert.id)}
                >
                  <DeleteIcon />
                </IconButton>
              )
            }
          >
            <Box sx={{ mr: 2 }}>{getSeverityIcon(alert.severity)}</Box>
            {/* Render the secondary block as a <div> so the nested <Typography> paragraphs are valid HTML */}
            <ListItemText
              secondaryTypographyProps={{ component: 'div' }}
              primary={
                <Box sx={{ display: 'flex', alignItems: 'center', gap: 1 }}>
                  <Typography variant="body1">{alert.message}</Typography>
                  <Chip
                    label={`GPU ${alert.gpuIndex}`}
                    size="small"
                    color={getSeverityColor(alert.severity)}
                    variant="outlined"
                  />
                </Box>
              }
              secondary={
                <>
                  <Typography variant="body2" color="text.secondary">
                    {format(new Date(alert.timestamp), 'MMM d, yyyy HH:mm:ss')}
                  </Typography>
                  <Typography variant="body2" color="text.secondary">
                    {alert.metricName}: {alert.value} (threshold: {alert.threshold})
                  </Typography>
                </>
              }
            />
          </ListItem>
        ))}
        {alerts.length === 0 && (
          <ListItem>
            <ListItemText
              primary={
                <Typography color="text.secondary">No active alerts</Typography>
              }
            />
          </ListItem>
        )}
      </List>
    </Paper>
  );
};

export default AlertsPanel;
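
For orientation, a minimal usage sketch of AlertsPanel follows; the AlertsDemo wrapper, sample alert values, and dismiss handler are illustrative and not part of the repo:

// Hypothetical demo wrapper; the alert below is sample data, not repo code.
import React, { useState } from 'react';
import AlertsPanel from './components/AlertsPanel';

const sampleAlerts = [
  {
    id: 'a1',
    severity: 'critical' as const,
    message: 'GPU 0 temperature above threshold',
    timestamp: new Date().toISOString(),
    gpuIndex: 0,
    metricName: 'temperature',
    value: 92,
    threshold: 85,
  },
];

export const AlertsDemo: React.FC = () => {
  const [alerts, setAlerts] = useState(sampleAlerts);
  // Dismissing an alert just filters it out of local state.
  const dismiss = (id: string) => setAlerts((prev) => prev.filter((a) => a.id !== id));
  return <AlertsPanel alerts={alerts} onDismiss={dismiss} />;
};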

frontend/src/components/MetricsGrid.tsx (4.9 KiB)

import React from 'react';
import {
  Grid,
  Paper,
  Typography,
  LinearProgress,
  Box,
  IconButton,
} from '@mui/material';
import {
  Memory as MemoryIcon,
  Speed as SpeedIcon,
  Thermostat as ThermostatIcon,
  Settings as SettingsIcon,
} from '@mui/icons-material';

interface GPUMetrics {
  index: number;
  name: string;
  utilization: number;
  memoryUsed: number;
  memoryTotal: number;
  temperature: number;
  fanSpeed: number;
  powerDraw: number;
  powerLimit: number;
}

interface MetricsGridProps {
  gpus: GPUMetrics[];
  onConfigureGPU?: (gpuIndex: number) => void;
}

const MetricsGrid: React.FC<MetricsGridProps> = ({ gpus, onConfigureGPU }) => {
  const getUtilizationColor = (value: number) => {
    if (value < 50) return 'success.main';
    if (value < 80) return 'warning.main';
    return 'error.main';
  };

  const getTemperatureColor = (value: number) => {
    if (value < 60) return 'success.main';
    if (value < 80) return 'warning.main';
    return 'error.main';
  };

  return (
    <Grid container spacing={3}>
      {gpus.map((gpu) => (
        <Grid item xs={12} md={6} lg={4} key={gpu.index}>
          <Paper sx={{ p: 2 }}>
            <Box sx={{ display: 'flex', justifyContent: 'space-between', mb: 2 }}>
              <Typography variant="h6">
                GPU {gpu.index}: {gpu.name}
              </Typography>
              {onConfigureGPU && (
                <IconButton
                  size="small"
                  onClick={() => onConfigureGPU(gpu.index)}
                >
                  <SettingsIcon />
                </IconButton>
              )}
            </Box>

            {/* Utilization */}
            <Box sx={{ mb: 2 }}>
              <Box sx={{ display: 'flex', alignItems: 'center', mb: 1 }}>
                <SpeedIcon sx={{ mr: 1 }} />
                <Typography variant="body2">Utilization</Typography>
              </Box>
              <Box sx={{ display: 'flex', alignItems: 'center' }}>
                <Box sx={{ flexGrow: 1, mr: 1 }}>
                  <LinearProgress
                    variant="determinate"
                    value={gpu.utilization}
                    sx={{
                      height: 10,
                      borderRadius: 5,
                      backgroundColor: 'action.hover',
                      '& .MuiLinearProgress-bar': {
                        backgroundColor: getUtilizationColor(gpu.utilization),
                      },
                    }}
                  />
                </Box>
                <Typography variant="body2">{gpu.utilization}%</Typography>
              </Box>
            </Box>

            {/* Memory */}
            <Box sx={{ mb: 2 }}>
              <Box sx={{ display: 'flex', alignItems: 'center', mb: 1 }}>
                <MemoryIcon sx={{ mr: 1 }} />
                <Typography variant="body2">Memory</Typography>
              </Box>
              <Box sx={{ display: 'flex', alignItems: 'center' }}>
                <Box sx={{ flexGrow: 1, mr: 1 }}>
                  <LinearProgress
                    variant="determinate"
                    value={(gpu.memoryUsed / gpu.memoryTotal) * 100}
                    sx={{
                      height: 10,
                      borderRadius: 5,
                    }}
                  />
                </Box>
                <Typography variant="body2">
                  {gpu.memoryUsed}/{gpu.memoryTotal} GB
                </Typography>
              </Box>
            </Box>

            {/* Temperature */}
            <Box sx={{ mb: 2 }}>
              <Box sx={{ display: 'flex', alignItems: 'center', mb: 1 }}>
                <ThermostatIcon sx={{ mr: 1 }} />
                <Typography variant="body2">Temperature</Typography>
              </Box>
              <Box sx={{ display: 'flex', alignItems: 'center' }}>
                <Box sx={{ flexGrow: 1, mr: 1 }}>
                  <LinearProgress
                    variant="determinate"
                    // Temperature (°C) maps directly onto the 0-100 bar; clamp so values above 100 cannot overflow it
                    value={Math.min(gpu.temperature, 100)}
                    sx={{
                      height: 10,
                      borderRadius: 5,
                      backgroundColor: 'action.hover',
                      '& .MuiLinearProgress-bar': {
                        backgroundColor: getTemperatureColor(gpu.temperature),
                      },
                    }}
                  />
                </Box>
                <Typography variant="body2">{gpu.temperature}ยฐC</Typography>
              </Box>
            </Box>

            {/* Power Usage */}
            <Box>
              <Typography variant="body2" color="text.secondary">
                Power: {gpu.powerDraw}W / {gpu.powerLimit}W
              </Typography>
              <Typography variant="body2" color="text.secondary">
                Fan Speed: {gpu.fanSpeed}%
              </Typography>
            </Box>
          </Paper>
        </Grid>
      ))}
    </Grid>
  );
};

export default MetricsGrid;
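
A usage sketch for MetricsGrid under the same caveat; the MetricsDemo wrapper and GPU numbers are made up:

// Hypothetical demo wrapper; the GPU values are sample data.
import React from 'react';
import MetricsGrid from './components/MetricsGrid';

const gpus = [
  {
    index: 0,
    name: 'NVIDIA RTX 4090',
    utilization: 72,
    memoryUsed: 18,
    memoryTotal: 24, // the component renders these as GB
    temperature: 66,
    fanSpeed: 55,
    powerDraw: 310,
    powerLimit: 450,
  },
];

export const MetricsDemo: React.FC = () => (
  <MetricsGrid gpus={gpus} onConfigureGPU={(i) => console.log('configure GPU', i)} />
);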

frontend/src/components/TimeSeriesChart.tsx (1.8 KiB)

import React from 'react';
import {
  LineChart,
  Line,
  XAxis,
  YAxis,
  CartesianGrid,
  Tooltip,
  Legend,
  ResponsiveContainer,
} from 'recharts';
import { format } from 'date-fns';
import { Box, Typography, useTheme } from '@mui/material';

interface TimeSeriesProps {
  data: Array<{
    timestamp: string;
    value: number;
  }>;
  title: string;
  dataKey: string;
  color?: string;
  unit?: string;
}

const TimeSeriesChart: React.FC<TimeSeriesProps> = ({
  data,
  title,
  dataKey,
  color = '#8884d8',
  unit = '',
}) => {
  const theme = useTheme();

  const formatXAxis = (tickItem: string) => {
    return format(new Date(tickItem), 'HH:mm:ss');
  };

  const formatTooltip = (value: number) => {
    return `${value}${unit}`;
  };

  return (
    <Box sx={{ width: '100%', height: 300, p: 2 }}>
      <Typography variant="h6" gutterBottom>
        {title}
      </Typography>
      <ResponsiveContainer>
        <LineChart
          data={data}
          margin={{
            top: 5,
            right: 30,
            left: 20,
            bottom: 5,
          }}
        >
          <CartesianGrid strokeDasharray="3 3" />
          <XAxis
            dataKey="timestamp"
            tickFormatter={formatXAxis}
            stroke={theme.palette.text.primary}
          />
          <YAxis stroke={theme.palette.text.primary} />
          <Tooltip
            labelFormatter={(label: string) => format(new Date(label), 'HH:mm:ss')}
            formatter={(value: number) => [formatTooltip(value), title]}
          />
          <Legend />
          <Line
            type="monotone"
            dataKey={dataKey}
            stroke={color}
            dot={false}
            activeDot={{ r: 8 }}
          />
        </LineChart>
      </ResponsiveContainer>
    </Box>
  );
};

export default TimeSeriesChart;
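
And a usage sketch for TimeSeriesChart; the sample points and title are illustrative. Note that dataKey must name the field holding the plotted number ('value' for the shape declared above):

// Hypothetical demo; timestamps and values are sample data.
import React from 'react';
import TimeSeriesChart from './components/TimeSeriesChart';

const points = [
  { timestamp: '2025-01-01T12:00:00Z', value: 61 },
  { timestamp: '2025-01-01T12:00:05Z', value: 64 },
  { timestamp: '2025-01-01T12:00:10Z', value: 67 },
];

export const TemperatureChart: React.FC = () => (
  // dataKey names the numeric field in each point ('value' in the shape above)
  <TimeSeriesChart data={points} title="GPU 0 Temperature" dataKey="value" unit="°C" />
);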

frontend/src/index.css (599 B)

:root {
  font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Open Sans', 'Helvetica Neue', sans-serif;
  line-height: 1.5;
  font-weight: 400;

  font-synthesis: none;
  text-rendering: optimizeLegibility;
  -webkit-font-smoothing: antialiased;
  -moz-osx-font-smoothing: grayscale;
}

body {
  margin: 0;
  min-width: 320px;
  min-height: 100vh;
}

#root {
  width: 100%;
  min-height: 100vh;
}

@keyframes spin {
  to { transform: rotate(360deg); }
}

@keyframes pulse {
  0% { opacity: 1; }
  50% { opacity: 0.5; }
  100% { opacity: 1; }
}

frontend/src/main.tsx (210 B)

import React from 'react'
import ReactDOM from 'react-dom/client'
import App from './App'

ReactDOM.createRoot(document.getElementById('root')!).render(
  <React.StrictMode>
    <App />
  </React.StrictMode>
)

frontend/src/vite-env.d.ts (38 B)

/// <reference types="vite/client" />

frontend/stop_frontend.sh (404 B)

#!/bin/bash

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
PID_FILE="$DIR/frontend.pid"

if [ -f "$PID_FILE" ]; then
    PID=$(cat "$PID_FILE")
    if ps -p $PID > /dev/null; then
        echo "Stopping Frontend (PID: $PID)"
        kill $PID
        rm "$PID_FILE"
    else
        echo "Frontend not running (stale PID file)"
        rm "$PID_FILE"
    fi
else
    echo "No PID file found"
fi

frontend/tsconfig.app.json (813 B)

{
  "extends": "./tsconfig.json",
  "compilerOptions": {
    "composite": true,
    "outDir": "./dist/app",
    "declaration": true,
    "declarationDir": "./dist/app/types",
    "rootDir": "src",
    "target": "ES2020",
    "useDefineForClassFields": true,
    "lib": ["ES2020", "DOM", "DOM.Iterable"],
    "module": "ESNext",
    "skipLibCheck": true,
    "moduleResolution": "bundler",
    "allowImportingTsExtensions": true,
    "resolveJsonModule": true,
    "isolatedModules": true,
    "noEmit": false,
    "jsx": "react-jsx",
    "strict": true,
    "noUnusedLocals": true,
    "noUnusedParameters": true,
    "noFallthroughCasesInSwitch": true
  },
  "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.d.ts"],
  "exclude": ["src/**/__tests__/*"],
  "references": [{ "path": "./tsconfig.node.json" }]
}

frontend/tsconfig.json (679 B)

{
  "compilerOptions": {
    "target": "ES2020",
    "lib": ["ES2020", "DOM", "DOM.Iterable"],
    "module": "ESNext",
    "skipLibCheck": true,
    "moduleResolution": "bundler",
    "allowImportingTsExtensions": true,
    "resolveJsonModule": true,
    "isolatedModules": true,
    "jsx": "react-jsx",
    "strict": true,
    "noUnusedLocals": true,
    "noUnusedParameters": true,
    "noFallthroughCasesInSwitch": true,
    "declaration": true,
    "declarationDir": "./dist/types",
    "emitDeclarationOnly": true
  },
  "include": ["src"],
  "exclude": ["node_modules"],
  "references": [
    { "path": "./tsconfig.app.json" },
    { "path": "./tsconfig.node.json" }
  ]
}

frontend/tsconfig.node.json (330 B)

{
  "compilerOptions": {
    "composite": true,
    "skipLibCheck": true,
    "module": "ESNext",
    "moduleResolution": "bundler",
    "allowSyntheticDefaultImports": true,
    "strict": true,
    "outDir": "./dist/node",
    "declaration": true,
    "declarationDir": "./dist/node/types"
  },
  "include": ["vite.config.ts"]
}

frontend/vite.config.ts (196 B)

import { defineConfig } from 'vite'
import react from '@vitejs/plugin-react'

export default defineConfig({
  plugins: [react()],
  server: {
    port: 5175,
    host: true,
    hmr: false
  }
})
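
If the dev server needs to call the FastAPI service (port 5183 in run.sh) without separate CORS setup, Vite's server.proxy option could be added; a sketch under that assumption, not a change the repo makes:

import { defineConfig } from 'vite'
import react from '@vitejs/plugin-react'

// Sketch: forward /api/* to the FastAPI service during development.
export default defineConfig({
  plugins: [react()],
  server: {
    port: 5175,
    host: true,
    hmr: false,
    proxy: {
      '/api': 'http://localhost:5183',
    },
  },
})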

frontend.pid (7 B)

579501

requirements.txt (98 B)

supabase==1.0.3
python-dotenv==1.0.0
pydantic==2.5.1
pytest==7.4.3
requests==2.32.3
psutil>=5.9.0

run.sh (1.5 KiB)

#!/bin/bash

# Function to check if a port is in use
check_port() {
    if lsof -Pi :$1 -sTCP:LISTEN -t >/dev/null ; then
        echo "Port $1 is already in use"
        return 1
    fi
    return 0
}

# Function to kill process on port
kill_port() {
    if lsof -Pi :$1 -sTCP:LISTEN -t >/dev/null ; then
        echo "Killing process on port $1..."
        lsof -ti:$1 | xargs kill -9
    fi
}

# Check if virtual environment exists
if [ ! -d "venv" ]; then
    echo "Creating virtual environment..."
    python3 -m venv venv
fi

# Activate virtual environment
source venv/bin/activate

# Install backend requirements
echo "Installing backend requirements..."
pip install -r requirements.txt

# Install frontend dependencies if needed
if [ ! -d "frontend/node_modules" ]; then
    echo "Installing frontend dependencies..."
    cd frontend && npm install && cd ..
fi

# Kill any existing processes on our ports
kill_port 5183  # FastAPI
kill_port 5175  # Vite dev server (port set in frontend/vite.config.ts)

# Start all components in background
echo "Starting GPU stats collector..."
python -m src.collector.collector &

echo "Starting FastAPI server..."
cd backend && python -m src.service.app &

echo "Starting frontend dev server..."
cd frontend && npm run dev &

# Wait for servers to start
sleep 3

echo "
๐Ÿš€ GPU Sentinel Pro is running!

๐Ÿ“Š Frontend: http://localhost:5173
๐Ÿ”ง API & Docs: http://localhost:5183
๐Ÿ“˜ API Documentation:
   - Swagger UI: http://localhost:5183/docs
   - ReDoc: http://localhost:5183/redoc

Press Ctrl+C to stop all services
"

# Wait for Ctrl+C
trap 'kill $(jobs -p)' INT
wait

sonar-project.properties (761 B)

# Project information
sonar.projectKey=jackccrawford_gpu-sentinel-pro
sonar.organization=jackccrawford
sonar.projectName=GPU Sentinel Pro
sonar.projectVersion=1.0

# Source code location
sonar.sources=backend/src,frontend/src
sonar.tests=backend/tests,frontend/src/**/*.test.tsx

# Language settings
sonar.python.version=3.11
sonar.typescript.node=20

# Coverage reports
sonar.python.coverage.reportPaths=backend/coverage-reports/coverage.xml
sonar.javascript.lcov.reportPaths=frontend/coverage/lcov.info

# Encoding of source files
sonar.sourceEncoding=UTF-8

# Analysis settings
sonar.exclusions=**/node_modules/**,**/*.pyc,**/__pycache__/**
sonar.coverage.exclusions=**/*.test.tsx,**/*.spec.ts,**/tests/**

# Quality gate settings
sonar.qualitygate.wait=true

src/__init__.py (0 B)


src/collector/__init__.py (0 B)


src/collector/collector.py (1.8 KiB)

import requests
import time
from datetime import datetime
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))

from src.models.gpu_metrics import GpuMetricsRecord
from src.database.client import supabase

class GpuStatsCollector:
    def __init__(self, api_url="http://localhost:5000/api/gpu-stats", interval=0.25):
        self.api_url = api_url
        self.interval = interval

    def fetch_stats(self):
        """Fetch GPU stats from the local API"""
        try:
            # A timeout keeps the collector from hanging if the stats API stalls
            response = requests.get(self.api_url, timeout=5)
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            print(f"Error fetching GPU stats: {e}")
            return None

    def collect_and_store(self):
        """Fetch stats and store them in Supabase"""
        data = self.fetch_stats()
        if data:
            try:
                metrics = GpuMetricsRecord(**data)
                result = supabase.insert_gpu_metrics(metrics)
                return result
            except Exception as e:
                print(f"Error storing metrics: {e}")
                return None

    def run_collector(self):
        """Run the collector continuously"""
        print(f"Starting GPU stats collection every {self.interval} seconds...")
        while True:
            try:
                self.collect_and_store()
                time.sleep(self.interval)
            except KeyboardInterrupt:
                print("\nStopping GPU stats collection...")
                break
            except Exception as e:
                print(f"Unexpected error: {e}")
                time.sleep(self.interval)

def main():
    collector = GpuStatsCollector()
    collector.run_collector()

if __name__ == "__main__":
    main()
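
For comparison, the same poll-and-forward loop sketched in TypeScript; the endpoint and 250 ms interval mirror the Python defaults, and the onMetrics sink is a hypothetical stand-in for the Supabase insert:

// Hypothetical TypeScript counterpart to the Python collector loop.
// onMetrics stands in for the Supabase insert and is not part of the repo.
const API_URL = 'http://localhost:5000/api/gpu-stats'; // same default as GpuStatsCollector
const INTERVAL_MS = 250; // mirrors interval=0.25 seconds

async function pollOnce(onMetrics: (payload: unknown) => void): Promise<void> {
  try {
    const res = await fetch(API_URL);
    if (!res.ok) throw new Error(`HTTP ${res.status}`);
    onMetrics(await res.json());
  } catch (err) {
    console.error('Error fetching GPU stats:', err); // same spirit as the Python handler
  }
}

setInterval(() => void pollOnce(console.log), INTERVAL_MS);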

supabase/.env.supabase (143 B)

POSTGRES_PASSWORD=Nc4wMosD0voRA7iSuNHKmzfdFsXyC1TY
JWT_SECRET=Rp0rKVWpTj8trLDaZZTPS58ppVd8C07u
OPERATOR_TOKEN=PIAh8fRfFPnd70DYRSuVshTI6NXNraAj

supabase/config.toml (5.9 KiB)

# A string used to distinguish different Supabase projects on the same host. Defaults to the
# working directory name when running `supabase init`.
project_id = "exponent-project-01"

[api]
enabled = true
# Port to use for the API URL.
port = 54321
# Schemas to expose in your API. Tables, views and stored procedures in these schemas will get API
# endpoints. public and storage are always included.
schemas = ["public", "storage", "graphql_public"]
# Extra schemas to add to the search_path of every request. public is always included.
extra_search_path = ["public", "extensions"]
# The maximum number of rows returned from a view, table, or stored procedure. Limits payload size
# for accidental or malicious requests.
max_rows = 1000

[db]
# Port to use for the local database URL.
port = 54322
# Port used by db diff command to initialize the shadow database.
shadow_port = 54320
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
# server_version;` on the remote database to check.
major_version = 15

[db.pooler]
enabled = false
# Port to use for the local connection pooler.
port = 54329
# Specifies when a server connection can be reused by other clients.
# Configure one of the supported pooler modes: `transaction`, `session`.
pool_mode = "transaction"
# How many server connections to allow per user/database pair.
default_pool_size = 20
# Maximum number of client connections allowed.
max_client_conn = 100

[realtime]
enabled = true
# Bind realtime via either IPv4 or IPv6. (default: IPv6)
# ip_version = "IPv6"
# The maximum length in bytes of HTTP request headers. (default: 4096)
# max_header_length = 4096

[studio]
enabled = true
# Port to use for Supabase Studio.
port = 54323
# External URL of the API server that frontend connects to.
api_url = "http://127.0.0.1"

# Email testing server. Emails sent with the local dev setup are not actually sent - rather, they
# are monitored, and you can view the emails that would have been sent from the web interface.
[inbucket]
enabled = true
# Port to use for the email testing server web interface.
port = 54324
# Uncomment to expose additional ports for testing user applications that send emails.
# smtp_port = 54325
# pop3_port = 54326

[storage]
enabled = true
# The maximum file size allowed (e.g. "5MB", "500KB").
file_size_limit = "50MiB"

[auth]
enabled = true
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
# in emails.
site_url = "http://127.0.0.1:3000"
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
additional_redirect_urls = ["https://127.0.0.1:3000"]
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
jwt_expiry = 3600
# If disabled, the refresh token will never expire.
enable_refresh_token_rotation = true
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
# Requires enable_refresh_token_rotation = true.
refresh_token_reuse_interval = 10
# Allow/disallow new user signups to your project.
enable_signup = true
# Allow/disallow testing manual linking of accounts
enable_manual_linking = false

[auth.email]
# Allow/disallow new user signups via email to your project.
enable_signup = true
# If enabled, a user will be required to confirm any email change on both the old, and new email
# addresses. If disabled, only the new email is required to confirm.
double_confirm_changes = true
# If enabled, users need to confirm their email address before signing in.
enable_confirmations = false

# Uncomment to customize email template
# [auth.email.template.invite]
# subject = "You have been invited"
# content_path = "./supabase/templates/invite.html"

[auth.sms]
# Allow/disallow new user signups via SMS to your project.
enable_signup = true
# If enabled, users need to confirm their phone number before signing in.
enable_confirmations = false
# Template for sending OTP to users
template = "Your code is {{ .Code }} ."

# Use pre-defined map of phone number to OTP for testing.
[auth.sms.test_otp]
# 4152127777 = "123456"

# This hook runs before a token is issued and allows you to add additional claims based on the authentication method used.
[auth.hook.custom_access_token]
# enabled = true
# uri = "pg-functions://<database>/<schema>/<hook_name>"


# Configure one of the supported SMS providers: `twilio`, `twilio_verify`, `messagebird`, `textlocal`, `vonage`.
[auth.sms.twilio]
enabled = false
account_sid = ""
message_service_sid = ""
# DO NOT commit your Twilio auth token to git. Use environment variable substitution instead:
auth_token = "env(SUPABASE_AUTH_SMS_TWILIO_AUTH_TOKEN)"

# Use an external OAuth provider. The full list of providers is: `apple`, `azure`, `bitbucket`,
# `discord`, `facebook`, `github`, `gitlab`, `google`, `keycloak`, `linkedin_oidc`, `notion`, `twitch`,
# `twitter`, `slack`, `spotify`, `workos`, `zoom`.
[auth.external.apple]
enabled = false
client_id = ""
# DO NOT commit your OAuth provider secret to git. Use environment variable substitution instead:
secret = "env(SUPABASE_AUTH_EXTERNAL_APPLE_SECRET)"
# Overrides the default auth redirectUrl.
redirect_uri = ""
# Overrides the default auth provider URL. Used to support self-hosted gitlab, single-tenant Azure,
# or any other third-party OIDC providers.
url = ""

[analytics]
enabled = false
port = 54327
vector_port = 54328
# Configure one of the supported backends: `postgres`, `bigquery`.
backend = "postgres"

# Experimental features may be deprecated any time
[experimental]
# Configures Postgres storage engine to use OrioleDB (S3)
orioledb_version = ""
# Configures S3 bucket URL, eg. <bucket_name>.s3-<region>.amazonaws.com
s3_host = "env(S3_HOST)"
# Configures S3 bucket region, eg. us-east-1
s3_region = "env(S3_REGION)"
# Configures AWS_ACCESS_KEY_ID for S3 bucket
s3_access_key = "env(S3_ACCESS_KEY)"
# Configures AWS_SECRET_ACCESS_KEY for S3 bucket
s3_secret_key = "env(S3_SECRET_KEY)"

supabase/docker-compose.simple.yml (412 B)

version: '3.8'
services:
  postgres:
    image: postgres:15
    ports:
      - "54432:5432"
    environment:
      POSTGRES_PASSWORD: postgres
      POSTGRES_DB: postgres
    volumes:
      - pg_data:/var/lib/postgresql/data
      - ./init:/docker-entrypoint-initdb.d
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 5s
      timeout: 5s
      retries: 5

volumes:
  pg_data:

supabase/docker-compose.yml (2.2 KiB)

version: '3.8'
services:
  postgres:
    image: supabase/postgres:latest
    ports:
      - "54432:5432"
    environment:
      POSTGRES_PASSWORD: Nc4wMosD0voRA7iSuNHKmzfdFsXyC1TY
      POSTGRES_DB: postgres
    volumes:
      - pg_data:/var/lib/postgresql/data
      - ./init:/docker-entrypoint-initdb.d
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 5s
      timeout: 5s
      retries: 5

  studio:
    image: supabase/studio:latest
    ports:
      - "54000:3000"
    environment:
      STUDIO_PG_META_URL: http://meta:8080
      POSTGRES_PASSWORD: Nc4wMosD0voRA7iSuNHKmzfdFsXyC1TY
      DEFAULT_ORGANIZATION_NAME: "GPU Metrics Monitor"
      SUPABASE_URL: http://localhost:54001
    depends_on:
      - postgres
      - kong

  kong:
    image: kong:latest
    ports:
      - "54001:8000"
      - "54443:8443"
    environment:
      KONG_DATABASE: "off"
      KONG_DECLARATIVE_CONFIG: /var/lib/kong/kong.yml
      KONG_DNS_ORDER: LAST,A,CNAME
      KONG_PLUGINS: request-transformer,cors,key-auth,acl
    volumes:
      - ./kong.yml:/var/lib/kong/kong.yml:ro

  auth:
    image: supabase/gotrue:latest
    ports:
      - "54002:9999"
    environment:
      GOTRUE_JWT_SECRET: "Rp0rKVWpTj8trLDaZZTPS58ppVd8C07u"
      GOTRUE_JWT_EXP: 3600
      GOTRUE_DB_DRIVER: postgres
      DATABASE_URL: "postgres://postgres:Nc4wMosD0voRA7iSuNHKmzfdFsXyC1TY@postgres:5432/postgres?sslmode=disable"
      API_EXTERNAL_URL: http://localhost:54001
      SITE_URL: http://localhost:54000
      OPERATOR_TOKEN: "PIAh8fRfFPnd70DYRSuVshTI6NXNraAj"
    depends_on:
      - postgres

  rest:
    image: postgrest/postgrest:latest
    ports:
      - "54003:3000"
    environment:
      PGRST_DB_URI: "postgres://postgres:Nc4wMosD0voRA7iSuNHKmzfdFsXyC1TY@postgres:5432/postgres"
      PGRST_DB_SCHEMA: public
      PGRST_DB_ANON_ROLE: anon
      PGRST_JWT_SECRET: "Rp0rKVWpTj8trLDaZZTPS58ppVd8C07u"
    depends_on:
      - postgres

  meta:
    image: supabase/postgres-meta:latest
    ports:
      - "54004:8080"
    environment:
      PG_META_PORT: 8080
      PG_META_DB_HOST: postgres
      PG_META_DB_PASSWORD: Nc4wMosD0voRA7iSuNHKmzfdFsXyC1TY
    depends_on:
      - postgres

volumes:
  pg_data:

supabase/docker-compose.yml.bak (2.4 KiB)

version: '3.8'
services:
  postgres:
    image: supabase/postgres:15.1.0.117
    ports:
      - "54432:5432"  # High port for Postgres
    environment:
      POSTGRES_PASSWORD: Nc4wMosD0voRA7iSuNHKmzfdFsXyC1TY
      POSTGRES_DB: postgres
    volumes:
      - pg_data:/var/lib/postgresql/data
      - ./init:/docker-entrypoint-initdb.d
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 5s
      timeout: 5s
      retries: 5

  studio:
    image: supabase/studio:20240205-9d2d574
    ports:
      - "54000:3000"  # High port for Studio
    environment:
      STUDIO_PG_META_URL: http://meta:8080
      POSTGRES_PASSWORD: Nc4wMosD0voRA7iSuNHKmzfdFsXyC1TY
      DEFAULT_ORGANIZATION_NAME: "GPU Metrics Monitor"
      SUPABASE_URL: http://localhost:54001  # Match new Kong port
    depends_on:
      - postgres
      - kong

  kong:
    image: kong:2.8.1
    ports:
      - "54001:8000"  # High port for Kong HTTP
      - "54443:8443"  # High port for Kong HTTPS
    environment:
      KONG_DATABASE: "off"
      KONG_DECLARATIVE_CONFIG: /var/lib/kong/kong.yml
      KONG_DNS_ORDER: LAST,A,CNAME
      KONG_PLUGINS: request-transformer,cors,key-auth,acl
    volumes:
      - ./kong.yml:/var/lib/kong/kong.yml:ro

  auth:
    image: supabase/gotrue:v2.132.3
    ports:
      - "54002:9999"  # High port for Auth
    environment:
      GOTRUE_JWT_SECRET: "Rp0rKVWpTj8trLDaZZTPS58ppVd8C07u"
      GOTRUE_JWT_EXP: 3600
      GOTRUE_DB_DRIVER: postgres
      DATABASE_URL: "postgres://postgres:Nc4wMosD0voRA7iSuNHKmzfdFsXyC1TY@postgres:5432/postgres?sslmode=disable"
      API_EXTERNAL_URL: http://localhost:54001  # Match new Kong port
      SITE_URL: http://localhost:54000  # Match new Studio port
      OPERATOR_TOKEN: "your-super-secret-operator-token"
    depends_on:
      - postgres

  rest:
    image: postgrest/postgrest:v11.2.0
    ports:
      - "54003:3000"  # High port for REST
    environment:
      PGRST_DB_URI: "postgres://postgres:Nc4wMosD0voRA7iSuNHKmzfdFsXyC1TY@postgres:5432/postgres"
      PGRST_DB_SCHEMA: public
      PGRST_DB_ANON_ROLE: anon
      PGRST_JWT_SECRET: "Rp0rKVWpTj8trLDaZZTPS58ppVd8C07u"
    depends_on:
      - postgres

  meta:
    image: supabase/postgres-meta:v0.68.0
    ports:
      - "54004:8080"  # High port for Meta
    environment:
      PG_META_PORT: 8080
      PG_META_DB_HOST: postgres
      PG_META_DB_PASSWORD: Nc4wMosD0voRA7iSuNHKmzfdFsXyC1TY
    depends_on:
      - postgres

volumes:
  pg_data:

supabase/init/00-init.sql (1.3 KiB)

-- Enable necessary extensions
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";

-- Create the gpu_metrics table
CREATE TABLE IF NOT EXISTS gpu_metrics (
    id uuid PRIMARY KEY DEFAULT uuid_generate_v4(),
    timestamp timestamptz NOT NULL DEFAULT now(),
    
    -- GPU Burn Metrics
    duration integer NOT NULL,
    errors integer NOT NULL,
    running boolean NOT NULL,
    
    -- Nvidia Info
    cuda_version text NOT NULL,
    driver_version text NOT NULL,
    
    -- GPU Metrics Array (stored as JSONB)
    gpus jsonb NOT NULL,
    
    -- Additional fields
    processes jsonb DEFAULT '[]'::jsonb,
    success boolean NOT NULL,
    
    -- Indexes for common queries
    created_at timestamptz NOT NULL DEFAULT now()
);

-- Create indexes for better query performance
CREATE INDEX IF NOT EXISTS idx_gpu_metrics_timestamp ON gpu_metrics(timestamp);
CREATE INDEX IF NOT EXISTS idx_gpu_metrics_created_at ON gpu_metrics(created_at);

-- Set up row level security (RLS)
ALTER TABLE gpu_metrics ENABLE ROW LEVEL SECURITY;

-- Create a policy that allows all operations for now (we can restrict this later)
CREATE POLICY "Allow all operations on gpu_metrics" 
    ON gpu_metrics 
    FOR ALL 
    USING (true) 
    WITH CHECK (true);

-- Add a comment to the table
COMMENT ON TABLE gpu_metrics IS 'Stores GPU metrics data collected from NVIDIA GPUs';
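
Once the table exists, recent rows can be read from application code; a minimal supabase-js sketch, assuming a local stack whose API gateway listens on 54001 and using a placeholder anon key:

import { createClient } from '@supabase/supabase-js';

// Placeholder URL and anon key; point these at your own local stack.
const supabase = createClient('http://localhost:54001', 'YOUR_ANON_KEY');

async function latestMetrics() {
  // Newest ten rows first, using the indexed timestamp column
  const { data, error } = await supabase
    .from('gpu_metrics')
    .select('timestamp, running, errors, gpus')
    .order('timestamp', { ascending: false })
    .limit(10);
  if (error) throw error;
  return data;
}

latestMetrics().then(console.log);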

supabase/kong.yml (499 B)

_format_version: "2.1"
_transform: true

services:
  - name: auth-v1
    url: http://auth:9999
    routes:
      - name: auth-v1-route
        paths:
          - /auth/v1
    plugins:
      - name: cors
  
  - name: rest-v1
    url: http://rest:3000
    routes:
      - name: rest-v1-route
        paths:
          - /rest/v1
    plugins:
      - name: cors

  - name: meta
    url: http://meta:8080
    routes:
      - name: meta-route
        paths:
          - /pg
    plugins:
      - name: cors
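
Requests through the /rest/v1 route above are plain PostgREST calls; a hedged fetch sketch (port 54001 per docker-compose.yml; whether auth headers are required depends on how Kong and PostgREST are configured):

// Hypothetical PostgREST call through Kong; run inside an async context or ES module.
const res = await fetch(
  'http://localhost:54001/rest/v1/gpu_metrics?select=timestamp,running&order=timestamp.desc&limit=5',
);
if (!res.ok) throw new Error(`HTTP ${res.status}`);
console.log(await res.json());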

supabase/seed.sql (0 B)


supabase/start.sh (1.9 KiB)

#!/bin/bash

# Function to check if Docker is running
check_docker() {
    if ! docker info >/dev/null 2>&1; then
        echo "Error: Docker is not running"
        exit 1
    fi
}

# Function to generate a secure random string
generate_secret() {
    openssl rand -base64 32 | tr -dc 'a-zA-Z0-9' | head -c 32
}

# Function to update docker-compose.yml with secure credentials
update_credentials() {
    local pg_password=$(generate_secret)
    local jwt_secret=$(generate_secret)
    local operator_token=$(generate_secret)

    # Create credentials file
    cat > .env.supabase << EOF
POSTGRES_PASSWORD=$pg_password
JWT_SECRET=$jwt_secret
OPERATOR_TOKEN=$operator_token
EOF

    # Update docker-compose.yml with the new credentials
    sed -i.bak "s/your-super-secret-password/$pg_password/g" docker-compose.yml
    sed -i.bak "s/your-super-secret-jwt-token-with-at-least-32-characters/$jwt_secret/g" docker-compose.yml
    sed -i.bak "s/your-super-secret-operator-token/$operator_token/g" docker-compose.yml

    echo "Credentials saved to .env.supabase"
    chmod 600 .env.supabase
}

# Main script
echo "Starting Supabase local setup..."

# Check Docker
check_docker

# Create init directory if it doesn't exist
mkdir -p init

# Generate credentials if they don't exist
if [ ! -f .env.supabase ]; then
    echo "Generating secure credentials..."
    update_credentials
fi

# Start services
echo "Starting Supabase services..."
docker-compose up -d

echo "
Supabase is starting! The following services will be available:
- Studio:  http://localhost:54000
- API:     http://localhost:54001
- Auth:    http://localhost:54002
- REST:    http://localhost:54003
- Meta:    http://localhost:54004
- DB:      localhost:54432

Credentials are stored in .env.supabase
"

# Wait for services to be healthy
echo "Waiting for services to be ready..."
sleep 10

echo "Setup complete! You can now access Supabase Studio at http://localhost:54000"

supabase/stop.sh (350 B)

#!/bin/bash

echo "Stopping Supabase services..."

# Stop all containers
docker-compose down

echo "Supabase services stopped."

# Optional cleanup flag
if [ "$1" == "--clean" ]; then
    echo "Cleaning up volumes..."
    docker-compose down -v
    rm -f .env.supabase docker-compose.yml.bak
    echo "Cleanup complete. All data has been removed."
fi
