{
  "ok": true,
  "report": {
    "id": "catalyst-q-competitive-benchmark-report",
    "updated_at": "2026-05-27",
    "methodology": [
      "Separate local synthetic eval signals from public benchmark proof.",
      "Name incumbent platforms and open baselines so buyers see exactly what Catalyst-Q must beat or complement.",
      "Use inspection policies as product guardrails before marketing or sales language is promoted.",
      "Prefer recorded baseline fixtures first, then pinned live solver/simulator runners in CI."
    ],
    "positioning": {
      "headline": "The verification layer for high-stakes scientific and operational AI decisions.",
      "category_design": "Evidence-first verification layer: Catalyst-Q is the system of record for expensive scientific, routing, grid, flight, and safety decisions that teams need to trust before they act.",
      "buyer_promise": "Bring us the disputed catalyst, material, route plan, dispatch scenario, or safety decision; Catalyst-Q returns a replayable proof packet with baselines, assumptions, ROI math, and the next evidence gate.",
      "wedge_sequence": [
        "Make verify() the core SDK primitive so every agent response can produce a consistency check, digest, replay id, and human-readable proof packet.",
        "Use open-source and incumbent tools as named baselines instead of pretending buyers will abandon them immediately.",
        "Sell verification packets and paid pilots first; expand into optimization automation only after benchmark and customer replay evidence is recorded.",
        "Keep Catalyst Brain as the memory/evidence orchestrator and Catalyst-Q as the scoped solver/verifier behind each packet."
      ],
      "core_rhetoric": [
        "Prove the decision before you automate it.",
        "Independent verification for black-box scientific and operational AI.",
        "From candidate generation to evidence-backed go/no-go.",
        "Buyer-safe proof packets for teams that cannot afford a wrong answer."
      ]
    },
    "battlecards": [
      {
        "segment": "Formal verification",
        "competitors": [
          "Cajal Technologies",
          "Lean/Coq/Isabelle-based proof-agent stacks"
        ],
        "competitor_strength": "They own the hot proof-certificate narrative: AI agents produce machine-checkable mathematical or software correctness artifacts.",
        "catalyst_q_angle": "Specialize in scientific result verification and operational proof packets, not compiled-binary correctness: verify chemistry scope, solver baselines, replay ids, and ROI evidence.",
        "how_to_crush": [
          "Adopt proof-certificate language around packet.verify() while keeping the proof scoped to scientific consistency and replay.",
          "Publish examples where the packet catches missing active-space metadata, broken charge/spin assumptions, or unproven ROI claims.",
          "Make verification understandable to a VP R&D or VP Ops without requiring them to read proof-assistant code."
        ],
        "proof_language": "Say Catalyst-Q produces scoped, replayable consistency proof packets that make scientific and operational decisions easier to trust.",
        "proof_to_publish_next": "Tamper-evident packet demo: original verifies, modified active space or energy claim fails."
      },
      {
        "segment": "Quantum chemistry platforms",
        "competitors": [
          "Quantinuum InQuanto",
          "QunaSys Qamuy",
          "Phasecraft",
          "Qiskit Nature"
        ],
        "competitor_strength": "They have deep quantum-algorithm credibility, hardware/cloud ecosystems, and researcher trust.",
        "catalyst_q_angle": "Win the active-space verification workflow around disputed results: parse customer assumptions, run scoped checks, compare PySCF/OpenFermion-style references, and emit an auditable proof packet.",
        "how_to_crush": [
          "Benchmark small active-space systems against PySCF/OpenFermion/Psi4 and publish exact fixtures.",
          "Package each result as a customer-ready report instead of only a notebook or circuit artifact.",
          "Make the API feel like procurement-safe verification: inputs, assumptions, proof, evidence scope, scientist review."
        ],
        "proof_language": "Say Catalyst-Q verifies declared active-space packets with references, replay ids, consistency checks, and scientist-ready reports.",
        "proof_to_publish_next": "Exact-chemistry benchmark packet set for small molecules and transition-metal fragments."
      },
      {
        "segment": "AI materials discovery",
        "competitors": [
          "CuspAI",
          "Orbital Materials",
          "Periodic Labs",
          "Microsoft MatterGen/MatterSim"
        ],
        "competitor_strength": "They are funded and talent-dense candidate-generation engines with strong AI-for-science narratives.",
        "catalyst_q_angle": "Verify the top 0.1% candidates from generated pipelines before wet-lab spend, partner review, or investor diligence.",
        "how_to_crush": [
          "Offer verification-as-a-service to materials teams that already have candidate generators.",
          "Show how a packet reduces false-positive wet-lab spend and flags DFT-sensitive assumptions.",
          "Integrate as an MCP/API verifier that an AI scientist can call before promoting a candidate."
        ],
        "proof_language": "Say Catalyst-Q is the go/no-go evidence layer that helps materials teams promote fewer, stronger candidates.",
        "proof_to_publish_next": "Candidate triage demo: generated material enters, verification packet ranks evidence gaps and go/no-go confidence."
      },
      {
        "segment": "Enterprise AQ / physics AI",
        "competitors": [
          "SandboxAQ",
          "Schrodinger",
          "XtalPi",
          "Iambic"
        ],
        "competitor_strength": "They bring enterprise trust, domain PhDs, proprietary datasets, and pharma/materials partnerships.",
        "catalyst_q_angle": "Attach an independent proof and replay layer to high-value disputed outputs, especially where the buyer needs a second opinion before committing capital.",
        "how_to_crush": [
          "Lead with narrow paid verification packets instead of trying to match enterprise platform breadth.",
          "Use transparent baselines and claim ledgers as a trust wedge against broad black-box platforms.",
          "Target smaller teams inside large enterprises who need a fast external challenge report."
        ],
        "proof_language": "Say Catalyst-Q is the independent challenge report that gives enterprise teams a faster second opinion on costly scientific decisions.",
        "proof_to_publish_next": "Third-party-style report comparing customer approximate methods against a Catalyst-Q verification packet."
      },
      {
        "segment": "Freight route optimization",
        "competitors": [
          "Google OR-Tools",
          "PyVRP",
          "VROOM",
          "GraphHopper",
          "Onfleet"
        ],
        "competitor_strength": "They have mature solvers, routing APIs, dispatch UX, and production integrations.",
        "catalyst_q_angle": "Sell baseline-vs-Catalyst ROI proof: replay current routes, compare public solver baselines, quantify miles, lateness, capacity, route stability, fuel, and emissions proxy.",
        "how_to_crush": [
          "Finish OR-Tools/PyVRP/VROOM benchmark evidence and publish the comparison packet.",
          "Make CSV upload and ROI packet generation effortless for ops leaders.",
          "Price around verified savings and fast paid pilots instead of generic route planning seats."
        ],
        "proof_language": "Say Catalyst-Q turns a fleet's current dispatch data into a baseline-vs-optimized ROI packet the CFO can inspect.",
        "proof_to_publish_next": "Freight proof packet: customer baseline vs OR-Tools/PyVRP/VROOM vs Catalyst-Q with savings math."
      },
      {
        "segment": "Grid, flight, and ATC incumbents",
        "competitors": [
          "GE Vernova GridOS",
          "Siemens Spectrum Power ADMS",
          "Boeing Jeppesen",
          "Thales TopSky",
          "Frequentis"
        ],
        "competitor_strength": "They own regulated workflows, integrations, procurement trust, and safety/compliance posture.",
        "catalyst_q_angle": "Start as advisory proof and simulator/offline replay: verify scenarios, rank options, expose assumptions, and preserve human approval.",
        "how_to_crush": [
          "Lead with offline replay and advisory workflows while building benchmark credibility.",
          "Publish PGLib/MATPOWER, OpenSky/BlueSky, and simulator replay packets.",
          "Sell to innovation teams as a decision-evidence layer that complements installed systems."
        ],
        "proof_language": "Say Catalyst-Q gives regulated teams ranked options, transparent assumptions, and replayable evidence before operational approval.",
        "proof_to_publish_next": "Offline replay packet showing operator-approved alternatives and explicit no-control-action boundaries."
      }
    ],
    "rows": [
      {
        "vertical_agent_id": "exact-chemistry-verification",
        "label": "Exact Chemistry Verification",
        "current_signal": "Self-verifying active-space packet now returns a SHA-256 consistency proof, replay id, declared active space, DFT comparison, small-subsystem checks, and a buyer-ready evidence scope.",
        "competitor_baselines": [
          {
            "name": "PySCF",
            "category": "open_solver",
            "url": "https://pyscf.org/",
            "comparison_use": "Open quantum-chemistry reference for molecular integrals, active-space workflows, and exact-diagonalization comparisons on small systems."
          },
          {
            "name": "OpenFermion",
            "category": "open_solver",
            "url": "https://quantumai.google/openfermion",
            "comparison_use": "Reference toolchain for fermionic Hamiltonian construction and quantum chemistry problem encodings."
          },
          {
            "name": "CuspAI / AI materials-discovery platforms",
            "category": "commercial_platform",
            "url": "https://www.cusp.ai/",
            "comparison_use": "AI science/materials companies generate candidates; Catalyst-Q should position as the verification oracle for disputed physics."
          },
          {
            "name": "Cajal Technologies",
            "category": "commercial_platform",
            "url": "https://www.ycombinator.com/companies/cajal-technologies",
            "comparison_use": "Formal-verification-agent trend reference; Catalyst-Q should emulate the trust posture for quantum simulation outputs."
          }
        ],
        "where_we_win": [
          "A premium verification packet is easier to monetize than generic self-serve quantum compute.",
          "packet.verify(), .rain replay, and small-subsystem checks directly address black-box skepticism.",
          "The wedge complements AI materials-discovery companies instead of competing with their candidate-generation engines."
        ],
        "where_we_are_behind": [
          "Current packet is a consistency-proof product surface, not yet a published chemistry accuracy benchmark suite.",
          "External PySCF/OpenFermion/DMRG/FCIQMC reference campaigns are needed before strong accuracy claims.",
          "Enterprise chemistry buyers will require scientist review, data-security review, and careful active-space scoping."
        ],
        "next_benchmark_gates": [
          "Run small active-space molecules and transition-metal fragments against PySCF/OpenFermion references.",
          "Record symmetry, charge, spin, and small-subsystem replay checks for every packet.",
          "Publish the chemistry evidence policy that defines the exact active-space scope, references, review role, and promotion gates."
        ],
        "claim_boundary": "Evidence scope: scoped active-space verification packets with declared assumptions, replay ids, reference comparisons, and scientist review.",
        "commercial_posture": "verification_beachhead"
      },
      {
        "vertical_agent_id": "freight-field-routing",
        "label": "Freight & Field RouteOps",
        "current_signal": "Local synthetic CVRPTW-shaped eval passes with 39.9% objective improvement versus nearest-neighbor baseline, no capacity violation, no lateness, and minimal-change replan passing.",
        "competitor_baselines": [
          {
            "name": "Google OR-Tools",
            "category": "open_solver",
            "url": "https://developers.google.com/optimization/routing/vrp",
            "comparison_use": "Pinned public VRP baseline for capacity, time-window, pickup-delivery, and routing-constraint feasibility."
          },
          {
            "name": "PyVRP",
            "category": "open_solver",
            "url": "https://pyvrp.readthedocs.io/en/stable/",
            "comparison_use": "Strong open-source VRP solver baseline for CVRPLIB and Solomon-style instances."
          },
          {
            "name": "VROOM",
            "category": "open_solver",
            "url": "https://vroom-project.org/",
            "comparison_use": "Fast open-source vehicle-routing optimization engine baseline for operational routing APIs."
          },
          {
            "name": "GraphHopper Route Optimization API",
            "category": "commercial_platform",
            "url": "https://docs.graphhopper.com/openapi/routing",
            "comparison_use": "Commercial routing API reference for routing constraints, integrations, and route optimization user expectations."
          }
        ],
        "where_we_win": [
          "Fastest path to a paid ROI pilot because the buyer can measure miles, lateness, vehicles, fuel proxy, and dispatcher workload.",
          ".rain replay and Catalyst Brain memory give the buyer a repeatable proof trail instead of only a black-box route answer.",
          "Cloudflare Browser Run and Pipelines can capture authorized portal evidence and live route/run telemetry for finance-grade ROI packets."
        ],
        "where_we_are_behind": [
          "OR-Tools, PyVRP, and VROOM are live in the Cloudflare benchmark runner; the current production mini fixture uses them as ensemble seeds rather than a best-in-class superiority claim.",
          "The current 39.9% synthetic improvement and live 142.27 open-solver objective need larger CVRPLIB/Solomon evidence before broad solver-superiority language.",
          "Production competitors have mature TMS, driver-app, mapping, and dispatch integrations."
        ],
        "next_benchmark_gates": [
          "Use the Worker-native WASM benchmark fallback for no-Docker local smoke only; do not treat it as external solver evidence.",
          "Use the live Cloudflare benchmark runner OR-Tools/PyVRP/VROOM rows as ensemble seeds for freight-routeops-mini-v1, then require Catalyst-Q to beat, match, or explain gaps before dispatcher review.",
          "Run Solomon VRPTW and CVRPLIB cases with normalized distance, lateness, capacity, vehicle, and minimal-change metrics.",
          "Gate marketing language on feasible routes within an agreed objective gap against the best feasible public baseline."
        ],
        "claim_boundary": "Evidence scope: paid-pilot ROI proof with pinned OR-Tools, PyVRP, VROOM, and customer-baseline comparisons before expansion language.",
        "commercial_posture": "paid_pilot_ready"
      },
      {
        "vertical_agent_id": "flight-ops-routing",
        "label": "Flight Ops Route Intelligence",
        "current_signal": "Aviation decision-support eval passes with synthetic OpenSky/BlueSky-shaped data, ranked resolution options, fuel/delay proxy, and no autonomous clearance language.",
        "competitor_baselines": [
          {
            "name": "NAVBLUE N-Flight Planning",
            "category": "commercial_platform",
            "url": "https://www.navblue.aero/product/n-flight-planning/",
            "comparison_use": "Incumbent flight-planning benchmark for dispatcher workflows, weather/NOTAM integration, and operational scale."
          },
          {
            "name": "Boeing Jeppesen flight planning",
            "category": "commercial_platform",
            "url": "https://ww2.jeppesen.com/aviation-products/",
            "comparison_use": "Reference for certified aviation planning expectations and airline operations integrations."
          },
          {
            "name": "Lufthansa Systems Lido Flight 4D",
            "category": "commercial_platform",
            "url": "https://www.lhsystems.com/solutions/flight-operations/lido-flight-4d",
            "comparison_use": "Reference for flight-plan optimization, airline dispatch integration, and regulated aviation buyer expectations."
          },
          {
            "name": "OpenSky Network",
            "category": "public_benchmark",
            "url": "https://opensky-network.org/data/",
            "comparison_use": "Public aviation-state data source for offline scenario replay and non-operational benchmark fixtures."
          }
        ],
        "where_we_win": [
          "Strong advisory and audit posture: every recommendation remains dispatcher/controller reviewed.",
          "Good fit for offline what-if analysis where route, weather, fuel, and delay tradeoffs need to be explained.",
          "Shared Catalyst-Q scenario packets can reuse freight ROI proof and replay infrastructure."
        ],
        "where_we_are_behind": [
          "Incumbents are certified, integrated, and trusted in operational dispatch environments.",
          "Current eval is synthetic and does not use real airline flight-plan archives.",
          "No weather, NOTAM, overflight-fee, aircraft-performance, or dispatch-system adapter is production-ready yet."
        ],
        "next_benchmark_gates": [
          "Add OpenSky-shaped historical replay fixtures and BlueSky simulation scenarios.",
          "Score fuel/time/risk deltas against dispatcher-approved baselines, not only synthetic proxy scenarios.",
          "Keep outputs as offline analysis until legal/safety review approves a narrower operational scope."
        ],
        "claim_boundary": "Evidence scope: offline analysis and dispatcher-reviewed advisory for flight-route what-if planning, with operator approval preserved.",
        "commercial_posture": "offline_analysis_pilot"
      },
      {
        "vertical_agent_id": "grid-optimization",
        "label": "GridOps Dispatch Intelligence",
        "current_signal": "Mini OPF-shaped eval passes with full 220 MW served, zero unserved load, zero renewable curtailment, zero line/voltage violations, and required operator approval; the PGLib/MATPOWER live lane parses selected public cases up to 2,000 buses, runs the pinned catalyst-dcopf-cg-v1 DC power-flow runner, and includes the bounded catalyst-dcopf-cut-v1 line-constrained redispatch gate.",
        "competitor_baselines": [
          {
            "name": "PGLib-OPF",
            "category": "public_benchmark",
            "url": "https://github.com/power-grid-lib/pglib-opf",
            "comparison_use": "Public AC optimal-power-flow cases for MATPOWER-compatible benchmark gates."
          },
          {
            "name": "MATPOWER",
            "category": "public_benchmark",
            "url": "https://matpower.org/",
            "comparison_use": "Power-system simulation and OPF reference for case parsing, dispatch feasibility, and solver comparisons."
          },
          {
            "name": "GE Vernova GridOS ADMS",
            "category": "commercial_platform",
            "url": "https://www.gevernova.com/software/products/gridos/advanced-distribution-management-system",
            "comparison_use": "Incumbent ADMS/DERMS platform reference for utility integration and control-room expectations."
          },
          {
            "name": "Siemens Spectrum Power ADMS",
            "category": "commercial_platform",
            "url": "https://www.siemens.com/en-us/products/gridscale-x/advanced-distribution-management/",
            "comparison_use": "Incumbent grid operations reference for ADMS capabilities, reliability posture, and utility procurement standards."
          }
        ],
        "where_we_win": [
          "Clear advisory-plus-evidence wedge for utilities that need replayable dispatch, contingency, and curtailment analysis.",
          "Catalyst-Q can package scenario exploration and approval gates without trying to control SCADA directly.",
          "Strong ROI narrative around curtailment, congestion, operating cost, and avoided violations."
        ],
        "where_we_are_behind": [
          "Current PGLib/MATPOWER proof is selected-case DC screening, not full DCOPF or ACOPF optimization superiority.",
          "Incumbents own ADMS/DERMS/SCADA integrations, utility support processes, and compliance workflows.",
          "No production state-estimation, telemetry-quality, N-1 security-constrained OPF, or utility data adapter is complete yet."
        ],
        "next_benchmark_gates": [
          "Expand eval:grid:pglib across larger PGLib-OPF cases and scheduled benchmark runs.",
          "Scale the line-constrained redispatch gate into full larger-case DCOPF, then add ACOPF validation for voltage, reactive power, and nonlinear feasibility.",
          "Require operator-approval and no-control-action language in every grid prompt and API output."
        ],
        "claim_boundary": "Evidence scope: advisory dispatch intelligence with public OPF evidence, operator approval, and integration readiness gates.",
        "commercial_posture": "advisory_proof"
      },
      {
        "vertical_agent_id": "atc-decision-support",
        "label": "ATC Simulator Decision Support",
        "current_signal": "Aviation safety eval now passes with 2 detected separation conflicts, 2 ranked resolution-option packets, 3 precise human handoffs, fuel/delay proxy, and no autonomous clearance.",
        "competitor_baselines": [
          {
            "name": "BlueSky Open Air Traffic Simulator",
            "category": "simulator",
            "url": "https://github.com/TUDelft-CNS-ATM/bluesky",
            "comparison_use": "Open simulator loop for replayable conflict-detection and resolution-option benchmark scenarios."
          },
          {
            "name": "Adacel MaxSim",
            "category": "commercial_platform",
            "url": "https://www.adacel.com/maxsim-air-traffic-control-simulation-training/",
            "comparison_use": "Commercial ATC simulator/training reference for buyer expectations and scenario-training workflows."
          },
          {
            "name": "Thales TopSky ATC",
            "category": "commercial_platform",
            "url": "https://www.thalesgroup.com/en/solutions-catalogue/civil-aviation/airspace-management/topsky-atc",
            "comparison_use": "Operational ATC platform reference for why Catalyst-Q must stay simulator/training first."
          },
          {
            "name": "Frequentis OneATM",
            "category": "commercial_platform",
            "url": "https://www.frequentis.com/en/air-traffic-management",
            "comparison_use": "ATM automation and communications incumbent reference for safety-case and integration expectations."
          }
        ],
        "where_we_win": [
          "Clear safety boundary: ranked options, false-positive discipline, and human authority are explicit in the eval contract.",
          "Compelling training/simulator add-on if Catalyst-Q can generate replayable scenarios and explain missed/false conflict behavior.",
          ".rain replay can become a strong safety-case artifact for simulation and research programs."
        ],
        "where_we_are_behind": [
          "Not suitable for live ATC operations.",
          "No certified integration with operational ATM systems or controller workstations.",
          "Needs BlueSky-style scenario replay, workload scoring, and false-positive/false-negative measurement at scale."
        ],
        "next_benchmark_gates": [
          "Add BlueSky scenario replay fixtures for conflict detection and resolution-option quality.",
          "Measure missed conflicts, false positives, workload proxy, fuel/delay proxy, and human handoff precision.",
          "Keep all commercial language anchored to simulator/training and offline what-if analysis."
        ],
        "claim_boundary": "Evidence scope: simulator training and offline decision-support evidence with ranked options, replay, and human authority.",
        "commercial_posture": "simulator_training_wedge"
      }
    ],
    "summary": {
      "vertical_count": 5,
      "paid_pilot_ready_count": 2,
      "recommended_sequence": [
        "freight-field-routing",
        "exact-chemistry-verification",
        "flight-ops-routing",
        "grid-optimization",
        "atc-decision-support"
      ],
      "overall_verdict": "Freight RouteOps is the first PMF wedge because buyers can inspect one route file, one proof packet, and one approval decision quickly; exact chemistry remains the premium second room; flight ops, grid, and ATC should stay advisory/offline until public benchmark and integration gates mature."
    }
  }
}