diff --git a/docs.json b/docs.json index d2cb9edd..84917a08 100644 --- a/docs.json +++ b/docs.json @@ -174,6 +174,36 @@ } ] }, + { + "tab": "CometChat On-Prem", + "dropdowns": [ + { + "dropdown": "Docker", + "icon": "/images/icons/docker.svg", + "pages": [ + "fundamentals/cometchat-on-prem/docker/overview", + "fundamentals/cometchat-on-prem/docker/prerequisites", + "fundamentals/cometchat-on-prem/docker/configuration-reference", + "fundamentals/cometchat-on-prem/docker/quick-start", + "fundamentals/cometchat-on-prem/docker/production-deployment", + "fundamentals/cometchat-on-prem/docker/monitoring", + "fundamentals/cometchat-on-prem/docker/security", + "fundamentals/cometchat-on-prem/docker/persistence-and-backup", + "fundamentals/cometchat-on-prem/docker/scaling", + "fundamentals/cometchat-on-prem/docker/upgrades", + "fundamentals/cometchat-on-prem/docker/troubleshooting", + "fundamentals/cometchat-on-prem/docker/air-gapped-deployment" + ] + }, + { + "dropdown": "Kubernetes", + "icon": "/images/icons/kubernetes.svg", + "pages": [ + "fundamentals/cometchat-on-prem/kubernetes/overview" + ] + } + ] + }, { "tab": "Widget Builder", "dropdowns": [ @@ -837,7 +867,7 @@ "icon": "/images/icons/react.svg", "versions": [ { - "version": "v5\u200e", + "version": "v5‎", "groups": [ { "group": " ", @@ -924,7 +954,7 @@ ] }, { - "version": "v4\u200e", + "version": "v4‎", "groups": [ { "group": " ", @@ -1087,7 +1117,7 @@ ] }, { - "version": "v3\u200e", + "version": "v3‎", "groups": [ { "group": " ", @@ -1109,7 +1139,7 @@ ] }, { - "version": "v2\u200e", + "version": "v2‎", "groups": [ { "group": " ", @@ -1137,7 +1167,7 @@ "icon": "/images/icons/swift.svg", "versions": [ { - "version": "v5\u200e\u200e", + "version": "v5‎‎", "groups": [ { "group": " ", @@ -1242,7 +1272,7 @@ ] }, { - "version": "v4\u200e\u200e", + "version": "v4‎‎", "groups": [ { "group": " ", @@ -1406,7 +1436,7 @@ ] }, { - "version": "v3\u200e\u200e", + "version": "v3‎‎", "groups": [ { "group": " ", @@ -1428,7 +1458,7 @@ ] }, { - "version": "v2\u200e\u200e", + "version": "v2‎‎", "groups": [ { "group": " ", @@ -1456,7 +1486,7 @@ "icon": "/images/icons/android.svg", "versions": [ { - "version": "v5\u200e\u200e\u200e", + "version": "v5‎‎‎", "groups": [ { "group": " ", @@ -1559,7 +1589,7 @@ ] }, { - "version": "v4\u200e\u200e\u200e", + "version": "v4‎‎‎", "groups": [ { "group": " ", @@ -1716,7 +1746,7 @@ ] }, { - "version": "v3\u200e\u200e\u200e", + "version": "v3‎‎‎", "groups": [ { "group": " ", @@ -1741,7 +1771,7 @@ ] }, { - "version": "v2\u200e\u200e\u200e", + "version": "v2‎‎‎", "groups": [ { "group": " ", @@ -1770,7 +1800,7 @@ "icon": "/images/icons/flutter.svg", "versions": [ { - "version": "v5\u200e\u200e\u200e\u200e", + "version": "v5‎‎‎‎", "groups": [ { "group": " ", @@ -1876,7 +1906,7 @@ ] }, { - "version": "v4\u200e\u200e\u200e\u200e", + "version": "v4‎‎‎‎", "groups": [ { "group": " ", @@ -2052,7 +2082,7 @@ "icon": "/images/icons/angular.svg", "versions": [ { - "version": "v4\u200e\u200e\u200e\u200e\u200e", + "version": "v4‎‎‎‎‎", "groups": [ { "group": " ", @@ -2242,7 +2272,7 @@ ] }, { - "version": "v3\u200e\u200e\u200e\u200e\u200e", + "version": "v3‎‎‎‎‎", "groups": [ { "group": " ", @@ -2263,7 +2293,7 @@ ] }, { - "version": "v2\u200e\u200e\u200e\u200e\u200e", + "version": "v2‎‎‎‎‎", "groups": [ { "group": " ", @@ -2290,7 +2320,7 @@ "icon": "/images/icons/vuejs.svg", "versions": [ { - "version": "v4\u200e\u200e\u200e\u200e\u200e\u200e", + "version": "v4‎‎‎‎‎‎", "groups": [ { "group": " ", @@ -2383,7 +2413,7 @@ ] }, { - "version": 
"v3\u200e\u200e\u200e\u200e\u200e\u200e", + "version": "v3‎‎‎‎‎‎", "groups": [ { "group": " ", @@ -2405,7 +2435,7 @@ ] }, { - "version": "v2\u200e\u200e\u200e\u200e\u200e\u200e", + "version": "v2‎‎‎‎‎‎", "groups": [ { "group": " ", @@ -2437,7 +2467,7 @@ "icon": "/images/icons/js.svg", "versions": [ { - "version": "v4\u200e\u200e\u200e\u200e\u200e\u200e\u200e", + "version": "v4‎‎‎‎‎‎‎", "groups": [ { "group": " ", @@ -2570,7 +2600,7 @@ ] }, { - "version": "v3\u200e\u200e\u200e\u200e\u200e\u200e\u200e", + "version": "v3‎‎‎‎‎‎‎", "groups": [ { "group": " ", @@ -2693,7 +2723,7 @@ ] }, { - "version": "v2\u200e\u200e\u200e\u200e\u200e\u200e\u200e", + "version": "v2‎‎‎‎‎‎‎", "groups": [ { "group": " ", @@ -2796,7 +2826,7 @@ "icon": "/images/icons/react.svg", "versions": [ { - "version": "v4\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e", + "version": "v4‎‎‎‎‎‎‎‎", "groups": [ { "group": " ", @@ -2911,7 +2941,7 @@ ] }, { - "version": "v3\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e", + "version": "v3‎‎‎‎‎‎‎‎", "groups": [ { "group": " ", @@ -3025,7 +3055,7 @@ ] }, { - "version": "v2\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e", + "version": "v2‎‎‎‎‎‎‎‎", "groups": [ { "group": " ", @@ -3127,7 +3157,7 @@ "icon": "/images/icons/swift.svg", "versions": [ { - "version": "v4\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e", + "version": "v4‎‎‎‎‎‎‎‎‎", "groups": [ { "group": " ", @@ -3250,7 +3280,7 @@ ] }, { - "version": "v3\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e", + "version": "v3‎‎‎‎‎‎‎‎‎", "groups": [ { "group": " ", @@ -3368,7 +3398,7 @@ ] }, { - "version": "v2\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e", + "version": "v2‎‎‎‎‎‎‎‎‎", "groups": [ { "group": " ", @@ -3475,7 +3505,7 @@ "icon": "/images/icons/android.svg", "versions": [ { - "version": "v4\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e", + "version": "v4‎‎‎‎‎‎‎‎‎‎", "groups": [ { "group": " ", @@ -3590,7 +3620,7 @@ ] }, { - "version": "v3\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e", + "version": "v3‎‎‎‎‎‎‎‎‎‎", "groups": [ { "group": " ", @@ -3710,7 +3740,7 @@ ] }, { - "version": "v2\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e", + "version": "v2‎‎‎‎‎‎‎‎‎‎", "groups": [ { "group": " ", @@ -3813,7 +3843,7 @@ "icon": "/images/icons/flutter.svg", "versions": [ { - "version": "v4\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e", + "version": "v4‎‎‎‎‎‎‎‎‎‎‎", "groups": [ { "group": " ", @@ -3927,7 +3957,7 @@ ] }, { - "version": "v3\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e", + "version": "v3‎‎‎‎‎‎‎‎‎‎‎", "groups": [ { "group": " ", @@ -4032,7 +4062,7 @@ "icon": "/images/icons/ionic.svg", "versions": [ { - "version": "v4\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e", + "version": "v4‎‎‎‎‎‎‎‎‎‎‎‎", "groups": [ { "group": " ", @@ -4140,7 +4170,7 @@ ] }, { - "version": "v3\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e", + "version": "v3‎‎‎‎‎‎‎‎‎‎‎‎", "groups": [ { "group": " ", @@ -4250,7 +4280,7 @@ ] }, { - "version": "v2\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e\u200e", + "version": "v2‎‎‎‎‎‎‎‎‎‎‎‎", "groups": [ { "group": " ", @@ -5005,20 +5035,20 @@ ] }, { - "tab": "Custom Bots", + "tab": "Custom Bots", "hidden": true, "pages": [ "/ai-chatbots/custom-bots" ] }, { - "tab": "AI Bots (Legacy)", + "tab": "AI Bots (Legacy)", "hidden": true, "pages": [ - "/ai-chatbots/ai-bots/overview", - "/ai-chatbots/ai-bots/instructions", - 
"/ai-chatbots/ai-bots/bots" - ] + "/ai-chatbots/ai-bots/overview", + "/ai-chatbots/ai-bots/instructions", + "/ai-chatbots/ai-bots/bots" + ] } ] }, @@ -5716,7 +5746,7 @@ { "source": "/ai-agents/tools", "destination": "/ai-agents/mastra-tools" - }, + }, { "source": "/ai-chatbots/overview", "destination": "/ai-chatbots/ai-bots/overview" @@ -5740,7 +5770,7 @@ "metatags": { "charset": "UTF-8", "viewport": "width=device-width, initial-scale=1.0", - "description": "Learn how to integrate, customize, and scale real-time chat using CometChat\u2019s UI Kits, SDKs, and widgets across popular frameworks. Get step-by-step guides, best practices, and implementation details to build production-ready chat experiences.", + "description": "Learn how to integrate, customize, and scale real-time chat using CometChat’s UI Kits, SDKs, and widgets across popular frameworks. Get step-by-step guides, best practices, and implementation details to build production-ready chat experiences.", "language": "en" } }, @@ -5749,4 +5779,4 @@ "redirect": true } } -} +} \ No newline at end of file diff --git a/fundamentals/cometchat-on-prem/docker/air-gapped-deployment.mdx b/fundamentals/cometchat-on-prem/docker/air-gapped-deployment.mdx new file mode 100644 index 00000000..62d47786 --- /dev/null +++ b/fundamentals/cometchat-on-prem/docker/air-gapped-deployment.mdx @@ -0,0 +1,23 @@ +--- +title: "Air-Gapped Deployment" +sidebarTitle: "Air-Gapped" +--- + +Guidelines for deploying the platform in offline or isolated (air-gapped) environments. + +## Offline installation steps + +- Export required Docker images with `docker save` +- Transfer images via removable media, secure copy (SSH), or an isolated internal network +- Import images on the target system with `docker load` + +## Local registry + +- Host images in Harbor, Nexus, or a private Docker registry +- Enforce role-based access control (RBAC) and image retention policies + +## Limitations in air-gapped mode + +- No access to external push notification services +- No S3 or other cloud object storage unless internally emulated +- No cloud-hosted analytics, logging, or monitoring integrations diff --git a/fundamentals/cometchat-on-prem/docker/configuration-reference.mdx b/fundamentals/cometchat-on-prem/docker/configuration-reference.mdx new file mode 100644 index 00000000..d43aebf9 --- /dev/null +++ b/fundamentals/cometchat-on-prem/docker/configuration-reference.mdx @@ -0,0 +1,120 @@ +--- +title: "Configuration Reference" +sidebarTitle: "Configuration" +--- + +Use this reference when updating domains, migrating environments, troubleshooting misconfiguration, or performing production deployments. Values are sourced from `docker-compose.yml`, service-level `.env` files, and the domain update guide. + +Use this when: +- Updating domains +- Migrating environments +- Troubleshooting service misconfiguration +- Performing production deployments + +## Global notes + +- All services read environment variables from their respective directories. +- Domain values must be updated consistently across API, WebSocket, Notifications, Webhooks, and NGINX configurations. +- Changing the primary domain impacts reverse proxy routing, OAuth headers, CORS, webhook endpoints, and TiDB host references. 
+ +## Chat API + +Update these values when changing domains: + +- `MAIN_DOMAIN=""` +- `EXTENSION_DOMAIN=""` +- `WEBHOOKS_BASE_URL="https://webhooks./v1/webhooks"` +- `TRIGGERS_BASE_URL="https://webhooks./v1/triggers"` +- `EXTENSION_BASE_URL="https://notifications."` +- `MODERATION_ENABLED=true` +- `RULES_BASE_URL="https://moderation./v1/moderation-service"` +- `ADMIN_API_HOST="api."` +- `CLIENT_API_HOST="apiclient."` +- `ALLOWED_API_DOMAINS=","` +- `DB_HOST="tidb."` +- `DB_HOST_CREATOR="tidb."` +- `V3_CHAT_HOST="websocket."` + +## Management API (MGMT API) + +- `ADMIN_API_HOST="api."` +- `CLIENT_API_HOST="apiclient."` +- `APP_HOST="dashboard."` +- `API_HOST="https://mgmt-api."` +- `MGMT_DOMAIN=""` +- `MGMT_DOMAIN_TO_REPLACE=""` +- `RULES_BASE_URL="https://moderation./v1/moderation"` +- `ACCESS_CONTROL_ALLOW_ORIGIN=","` + +## WebSocket + +Hostnames are derived automatically from NGINX and Chat API configuration; no manual domain updates are required. + +## Notifications service + +- `CC_DOMAIN=""` (controls routing, token validation, and push delivery) + +## Moderation service + +- `CHAT_API_URL=""` for rule evaluation, metadata retrieval, and decision submission + +## Webhooks service + +- `CHAT_API_DOMAIN=""` - must match the Chat API domain exactly to avoid retries or signature verification failures + +## Extensions + +```json +"DOMAINS": [ + "", + "", + "" +], +"DOMAIN_NAME": "" +``` + +Defines CORS and allowed origins for extension traffic. + +## Receipt Updater + +- `RECEIPTS_MYSQL_HOST="tidb."` for delivery receipts, read receipts, and thread metadata + +## SQL Consumer + +```json +"CONNECTION_CONFIG": { + "host": "" +}, +"ALTER_USER_CONFIG": { + "host": "" +}, +"API_CONFIG": { + "API_DOMAIN": "" +} +``` + +Controls database migrations, multi-tenant provisioning, and internal requests to Chat API. + +## NGINX configuration files + +Update domain values in: + +- chatapi.conf +- extensions.conf +- mgmtapi.conf +- notifications.conf +- dashboard.conf +- globalwebhooks.conf +- moderation.conf +- websocket.conf + +These govern TLS termination, routing, reverse proxy rules, and WebSocket upgrades. + +## Summary of domain values to update + +- Chat API, Client API, and Management API +- Notifications, Moderation, Webhooks, and Extensions services +- NGINX reverse proxy hostnames +- TiDB host references +- WebSocket host configuration in Chat API + diff --git a/fundamentals/cometchat-on-prem/docker/monitoring.mdx b/fundamentals/cometchat-on-prem/docker/monitoring.mdx new file mode 100644 index 00000000..bc321d64 --- /dev/null +++ b/fundamentals/cometchat-on-prem/docker/monitoring.mdx @@ -0,0 +1,175 @@ +--- +title: "Monitoring" +sidebarTitle: "Monitoring" +--- + +Monitoring ensures system health, operational visibility, and SLA compliance for CometChat On-Prem deployments. 
+ +## Monitoring stack + +The following open-source tools form the monitoring and observability stack for CometChat On-Prem deployments: + +- **Prometheus**: Collects and stores metrics from all services +- **Grafana**: Visualizes metrics with dashboards and alerts +- **Loki**: Stores and queries logs from all containers +- **Promtail**: Tails logs from Docker containers and pushes them to Loki +- **Node Exporter**: Collects host-level metrics (CPU, memory, disk, network) +- **cAdvisor**: Collects container-level resource usage metrics + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Grafana │ +│ (Dashboards & Visualization) │ +└──────────────┬─────────────────────────┬────────────────────┘ + │ │ + │ Queries │ Queries + ▼ ▼ + ┌──────────────────┐ ┌──────────────────┐ + │ Prometheus │ │ Loki │ + │ (Metrics Store) │ │ (Log Store) │ + └────────┬─────────┘ └────────┬─────────┘ + │ │ + │ Scrapes (/metrics) │ Pushes + ▼ ▼ + ┌─────────────────────────────────────────┐ + │ Node Exporter │ cAdvisor │ Promtail │ + │ (Host Metrics) │ (Container)│ (Logs) │ + └─────────────────────────────────────────┘ + │ │ │ + └────────────────┴──────────────┘ + │ + ┌───────▼────────┐ + │ Docker Swarm │ + │ CometChat │ + │ Services │ + └────────────────┘ +``` + +## Key metrics to monitor + +### Infrastructure +- CPU usage per node +- Memory usage per node +- Disk space and I/O +- Network traffic +- Container resource usage + +### Application services +- WebSocket active connections +- Chat API request rate and latency +- API error rates (4xx, 5xx) +- Service uptime + +### Data stores +- **Kafka**: Consumer lag, message throughput +- **Redis**: Memory usage, cache hit ratio, connected clients +- **MongoDB**: Operation latency, connections, replication lag +- **TiDB**: Query duration, region health, storage capacity + +### Load balancer +- NGINX request rate +- Response status codes +- Active connections + +## Alerting + +Alerts should focus on user impact, capacity risks, and data integrity rather than raw metric noise. + +Set up alerts for these critical conditions: + +- CPU usage > 80% for 5 minutes +- Memory usage > 85% for 5 minutes +- Disk space < 15% +- Service down for 2 minutes +- Database query latency > 100ms +- Kafka consumer lag > 10,000 messages +- Redis memory > 90% +- WebSocket connection errors > 10/second +- API error rate > 5% +- Container restarts + +These thresholds are recommended starting points and should be adjusted based on workload characteristics and environment scale. + +## Grafana dashboards + +Create dashboards to visualize: + +1. **Overview**: System health, active users, request rates, error rates +2. **Infrastructure**: CPU, memory, disk, network per node +3. **WebSocket**: Active connections, message throughput, errors +4. **API**: Request rate, latency, error rates by endpoint +5. **Databases**: Query performance, connections, replication status +6. **Kafka**: Consumer lag, throughput, partition health +7. **Logs & Error Analysis**: Error aggregation, log volume, search, and correlation with metrics + +### Logs & Error Analysis Dashboard + +This dashboard provides centralized visibility into application errors, log patterns, and system anomalies for rapid troubleshooting and incident investigation. 
+ +**Key Visualizations:** + +- **Error Volume by Service**: Time-series graph showing error log count per service, helping identify which components are experiencing issues +- **Top Error Messages**: Table displaying the most frequent error messages with occurrence counts, enabling quick identification of recurring problems +- **Log Volume Trends**: Track total log volume over time to detect unusual spikes that may indicate issues or attacks +- **Error Rate by Severity**: Breakdown of errors by severity level (CRITICAL, ERROR, WARNING) for prioritization +- **Service Health Correlation**: Side-by-side view of error logs and service metrics (CPU, memory, latency) to correlate errors with resource constraints +- **Search & Filter**: Interactive LogQL query panel for ad-hoc log searches and pattern matching +- **Recent Critical Errors**: Live feed of the latest critical errors across all services for immediate awareness + +**Use Cases:** +- Rapid incident investigation by correlating errors with metric anomalies +- Identifying error patterns and root causes across distributed services +- Monitoring error trends to detect degradation before user impact +- Post-incident analysis and root cause identification +- Compliance and audit trail review + +## Log queries + +Use Loki's LogQL to search and filter logs across all services: + +```logql +# View all errors +{service="chat-api"} |= "error" + +# WebSocket connection issues +{service="websocket"} |~ "connection.*failed" + +# API 5xx errors +{service="nginx"} |~ "HTTP/[0-9.]+ 5[0-9]{2}" + +# High latency requests +{service="chat-api"} | json | latency > 1000 +``` + +## Troubleshooting + +### First check Grafana dashboards + +Start with the Overview dashboard to determine blast radius before drilling into component-level dashboards. Confirm whether the issue is node-level, service-level, or data-store related before diving into individual components. + +### Check Prometheus targets +```bash +curl http://localhost:9090/api/v1/targets +``` + +### Check Loki status +```bash +curl http://localhost:3100/ready +``` + +### View Promtail logs +```bash +docker service logs promtail +``` + +### Check service metrics +```bash +# Node Exporter +curl http://localhost:9100/metrics + +# cAdvisor +curl http://localhost:8080/metrics +``` + diff --git a/fundamentals/cometchat-on-prem/docker/overview.mdx b/fundamentals/cometchat-on-prem/docker/overview.mdx new file mode 100644 index 00000000..0967ece6 --- /dev/null +++ b/fundamentals/cometchat-on-prem/docker/overview.mdx @@ -0,0 +1,221 @@ +--- +title: "CometChat On-Prem Overview" +sidebarTitle: "Overview" +--- + +CometChat On-Prem delivers an enterprise-grade, self-hosted real-time messaging platform engineered for mission-critical applications requiring complete data sovereignty, regulatory compliance, and predictable performance at scale. Built on battle-tested open-source technologies and cloud-native principles, this deployment architecture supports workloads from 10,000 to 250,000+ monthly active users with linear scalability and sub-100ms message latency. + +**This guide covers Docker Swarm deployments.** For Kubernetes deployments (recommended for 200k+ MAU or multi-region requirements), please [contact our enterprise solutions team](../kubernetes/overview). 
+ +**Enterprise Value Proposition** +- **Data Sovereignty**: Complete control over data residency, encryption, and access policies to meet GDPR, HIPAA, SOC 2, and industry-specific compliance requirements +- **Predictable Economics**: Eliminate per-user SaaS pricing with fixed infrastructure costs and transparent capacity planning +- **Operational Control**: Full visibility into system behavior, customizable monitoring, and direct access to all components for troubleshooting and optimization +- **Security Posture**: Deploy within your existing security perimeter with private networks, custom authentication integration, and air-gapped deployment options +- **Performance Guarantees**: Achieve consistent sub-100ms latency with dedicated resources and optimized data locality + +## Who this guide is for + +This comprehensive deployment guide is designed for technical decision-makers and implementation teams responsible for enterprise infrastructure: + +- **DevOps & SRE Teams**: Operations professionals managing production uptime, incident response, capacity planning, and continuous deployment pipelines +- **Platform & Backend Engineers**: Technical leads architecting scalable systems, optimizing performance, and integrating real-time messaging into existing application ecosystems +- **Infrastructure Architects**: Strategic planners designing multi-region deployments, disaster recovery strategies, compliance frameworks, and long-term scalability roadmaps +- **Security & Compliance Officers**: Stakeholders ensuring data protection, access controls, audit logging, and regulatory adherence across all system components + +## Platform capabilities + +CometChat On-Prem provides a comprehensive suite of real-time communication features designed for enterprise applications: + +**Core Messaging Infrastructure** +- **1:1 and Group Conversations**: Scalable messaging architecture supporting unlimited conversation threads with persistent message history, rich media attachments, and message threading +- **Real-time Event Streaming**: WebSocket-based bi-directional communication delivering instant presence updates, typing indicators, delivery receipts, read receipts, and custom event propagation +- **Message Delivery Guarantees**: Effectively-once message delivery semantics using idempotent producers, automatic retry logic, offline message queuing, and synchronization across multiple devices + +**Enterprise Features** +- **Distributed Event Pipeline**: Apache Kafka-powered event backbone enabling decoupled microservices architecture, guaranteed message ordering, and fault-tolerant event processing at scale +- **Push Notifications**: Multi-provider notification delivery supporting Firebase Cloud Messaging (FCM), Apple Push Notification Service (APNs), and custom webhook integrations with intelligent batching and delivery optimization +- **Content Moderation**: Configurable policy engine with real-time profanity filtering, spam detection, image moderation, and extensible AI/ML adapter framework for custom moderation workflows +- **Webhooks & Integrations**: Reliable outbound event delivery system with configurable retry policies, HMAC signature validation, and comprehensive audit trails for third-party system integration + +**API & Developer Experience** +- **RESTful APIs**: Comprehensive HTTP APIs for user management, conversation operations, group administration, metadata queries, and administrative functions with OpenAPI documentation +- **Horizontal Scalability**: Stateless service design enabling linear 
scaling by adding compute resources without architectural changes or data migration +- **Multi-tenancy Support**: Logical isolation of customer data with tenant-specific configurations, rate limits, and resource quotas + +## Data architecture & storage + +The platform employs a polyglot persistence strategy, selecting optimal storage technologies for specific data access patterns and consistency requirements: + +**Primary Data Stores** +- **TiDB Cluster**: Horizontally scalable, MySQL-compatible distributed SQL database providing ACID transactions, automatic sharding, and multi-region replication. Composed of three components: + - **Placement Driver (PD)**: Cluster metadata management and intelligent data placement + - **TiKV**: Distributed key-value storage engine with Raft consensus for strong consistency + - **TiDB SQL Layer**: MySQL-compatible query interface with distributed transaction coordination + - **Use Cases**: User profiles, conversation metadata, group memberships, message indices, and relational data requiring strong consistency guarantees + +- **MongoDB**: Document-oriented database optimized for flexible schema evolution and semi-structured data with native JSON support and rich query capabilities + - **Use Cases**: Moderation policies, user preferences, custom metadata fields, audit logs, and data requiring schema flexibility + +- **Redis Clusters**: Three dedicated in-memory data structure stores providing sub-millisecond latency for high-frequency operations: + - **Cache Cluster**: Application-level caching, query result caching, and frequently accessed data + - **Pub/Sub Cluster**: Real-time message broadcasting and event distribution for WebSocket connections + - **Session & Rate Limiting Cluster**: User session management, authentication tokens, and distributed rate limiting counters + +**Event Streaming Platform** +- **Apache Kafka**: Distributed commit log serving as the central event backbone for asynchronous communication between microservices + - **Use Cases**: Real-time message delivery, event sourcing, audit trails, analytics pipelines, and inter-service communication + - **Guarantees**: At-least-once delivery, message ordering per partition, configurable retention policies, and horizontal scalability + +**Optional Storage Systems** +- **Object Storage**: S3-compatible storage (Amazon S3, MinIO, Ceph, or Google Cloud Storage) for unstructured data and large binary objects + - **Use Cases**: Media attachments (images, videos, documents), backup archives, log files, and compliance data requiring long-term retention + - **Features**: Lifecycle policies, versioning, encryption at rest, and cost-optimized storage tiers + +**Data Durability & Backup** +- Automated backup strategies with point-in-time recovery capabilities +- Cross-region replication options for disaster recovery scenarios +- Configurable retention policies aligned with compliance requirements + +## Deployment models + +CometChat On-Prem supports multiple deployment architectures to match your operational maturity, scale requirements, and infrastructure preferences: + +### Local Development (Docker Compose) +**Target Environment**: Development workstations, CI/CD pipelines, and QA environments + +**Characteristics**: +- Single-machine deployment with all services containerized +- Simplified dependency management and rapid environment provisioning +- Ideal for feature development, integration testing, and proof-of-concept demonstrations +- **Not recommended for production workloads** due to single 
point of failure and limited scalability + +**Use Cases**: Local development, automated testing, developer onboarding, and architecture evaluation + +--- + +### Docker Swarm (Recommended: 10k-200k MAU) +**Target Environment**: Production deployments up to ~200,000 monthly active users and ~20,000 peak concurrent connections + +**Characteristics**: +- Lightweight orchestration with native Docker integration and minimal operational overhead +- Predictable service placement with node constraints and resource reservations +- Secure overlay networking with encrypted service-to-service communication +- Rolling updates with configurable health checks and automatic rollback capabilities +- Built-in load balancing and service discovery without external dependencies + +**Enterprise Benefits**: +- Lower operational complexity compared to Kubernetes while maintaining production-grade reliability +- Faster deployment cycles with straightforward configuration management +- Reduced infrastructure costs with efficient resource utilization +- Proven architecture supporting hundreds of production deployments + +**Recommended For**: Mid-market enterprises, SaaS platforms, healthcare applications, financial services, and organizations prioritizing operational simplicity + +--- + +### Kubernetes (Enterprise & Multi-Region) +**Target Environment**: Large-scale deployments exceeding 200,000 MAU, multi-region architectures, or strict compliance requirements + +**Characteristics**: +- Advanced autoscaling with Horizontal Pod Autoscaler (HPA) and Vertical Pod Autoscaler (VPA) +- Multi-region active-active deployments with global load balancing +- Service mesh integration (Istio, Linkerd) for mTLS, traffic management, and observability +- Cloud-native Kafka operators (Strimzi, Confluent) for automated cluster management +- GitOps workflows with ArgoCD or Flux for declarative infrastructure management +- Advanced security policies with Pod Security Standards and Network Policies + +**Enterprise Benefits**: +- Unlimited horizontal scalability with automated capacity management +- Cross-region disaster recovery with sub-minute failover times +- Compliance-ready architecture supporting SOC 2, ISO 27001, and industry-specific regulations +- Integration with enterprise identity providers (LDAP, Active Directory, SAML, OAuth) +- Advanced observability with distributed tracing and service dependency mapping + +**Recommended For**: Global enterprises, regulated industries (healthcare, finance, government), multi-tenant SaaS platforms, and organizations with existing Kubernetes expertise + +**Enterprise Support**: Contact our solutions team for Kubernetes reference architectures, migration planning, and ongoing operational guidance tailored to your specific requirements. + +## High-level architecture + +The CometChat On-Prem platform employs a modern, microservices-based architecture designed for enterprise-grade reliability, security, and performance. The system is built on proven open-source technologies and follows cloud-native principles to ensure operational excellence at scale. 
+ +![CometChat On-Prem Architecture](/images/docker-on-prem-architecture.png) + +### Architecture Components + +**Client Layer** +- **Desktop Clients**: Native desktop applications and web browsers accessing the platform via HTTPS and WebSocket protocols +- **Mobile Apps**: iOS and Android applications with persistent connections for real-time messaging and push notification support +- **Cloud Services**: Third-party integrations, webhooks consumers, and external systems interfacing with the platform APIs + +**Load Balancer** +- Enterprise-grade traffic distribution layer providing high availability, SSL/TLS termination, health checking, and automatic failover across Docker Swarm nodes +- Supports session affinity for WebSocket connections and intelligent routing based on service health metrics + +**Docker Swarm Cluster** + +The core platform runs within a Docker Swarm orchestration environment, providing service discovery, load balancing, and automated container management. + +**Backend Services** + +*NGINX Reverse Proxy* +- TLS/SSL termination with configurable cipher suites and certificate management +- HTTP/2 and WebSocket protocol support with automatic upgrade handling +- Request routing to microservices based on URL patterns and headers +- Rate limiting, request buffering, and connection pooling for optimal performance + +*Microservices Layer* +- **WebSocket Gateway**: Maintains persistent bi-directional connections for real-time event delivery, presence management, typing indicators, and instant message routing with automatic reconnection and session recovery +- **Chat API Service**: RESTful API handling message CRUD operations, conversation management, user operations, group administration, and metadata queries with transaction support +- **Moderation Service**: Content filtering engine with configurable policies, profanity detection, spam prevention, image moderation, and AI/ML integration for advanced threat detection +- **Notifications Service**: Asynchronous push notification dispatcher supporting FCM, APNs, and custom providers with intelligent batching, retry logic, and delivery tracking +- **Webhooks Service**: Outbound event delivery system with configurable retry policies, exponential backoff, HMAC signature validation, and comprehensive audit logging + +**Kafka Event Bus** +- Distributed event streaming platform serving as the central message backbone for inter-service communication +- Provides guaranteed message ordering, fault-tolerant persistence, and horizontal scalability +- Enables decoupled microservices architecture with publish-subscribe and event sourcing patterns +- Handles real-time message routing, event notifications, and asynchronous processing pipelines +- **ZooKeeper**: Distributed coordination service used by Kafka for leader election, metadata management, and cluster coordination + +**Data Store Components** + +*TiDB Cluster (Distributed SQL Database)* +- **Placement Driver (PD)**: Cluster metadata management, timestamp allocation, and intelligent data placement decisions +- **TiKV**: Distributed transactional key-value storage engine with Raft consensus protocol ensuring strong consistency and automatic data replication +- **TiDB SQL Layer**: MySQL-compatible query interface with distributed transaction coordination, supporting ACID guarantees and horizontal scalability +- **Use Cases**: User profiles, conversation metadata, message indices, group memberships, and all relational data requiring strong consistency + +*Redis Clusters* +- Three dedicated 
in-memory data structure stores providing sub-millisecond latency +- **Cache Cluster**: Application-level caching, query result caching, and frequently accessed data +- **Pub/Sub Cluster**: Real-time message broadcasting and event distribution for WebSocket connections +- **Session & Rate Limiting Cluster**: User session management, authentication tokens, and distributed rate limiting counters + +*MongoDB* +- Document-oriented database for flexible schema requirements +- Stores moderation policies, user preferences, custom metadata, webhook configurations, and audit logs +- Provides native JSON support and rich query capabilities for semi-structured data + +**Frontend Service** +- **Frontend Application**: Web-based administrative dashboard and user interface components + +**Monitoring Stack** +- **Prometheus**: Time-series metrics collection system scraping service endpoints, storing performance data, and triggering alerts based on configurable thresholds +- **Grafana**: Visualization platform providing real-time operational dashboards, SLA monitoring, capacity planning insights, and customizable alerting workflows +- **Loki & Promtail**: Centralized log aggregation and querying infrastructure enabling rapid troubleshooting and audit trail analysis +- **Node Exporter & cAdvisor**: Host and container-level metrics collection for infrastructure monitoring and capacity planning + +**Infrastructure Layer** + +*Host Infrastructure* +- Physical or virtual compute resources running Docker Swarm nodes +- Persistent storage volumes for stateful services (databases, Kafka, logs) +- Resource allocation and isolation using Docker resource constraints + +*Private Network* +- Secure overlay network isolating backend services from external access +- Encrypted service-to-service communication using Docker Swarm's built-in encryption +- Network segmentation separating public-facing services from data stores +- Optimized routing paths minimizing inter-service latency and maximizing throughput diff --git a/fundamentals/cometchat-on-prem/docker/persistence-and-backup.mdx b/fundamentals/cometchat-on-prem/docker/persistence-and-backup.mdx new file mode 100644 index 00000000..5600eb83 --- /dev/null +++ b/fundamentals/cometchat-on-prem/docker/persistence-and-backup.mdx @@ -0,0 +1,234 @@ +--- +title: "Persistence & Backup" +sidebarTitle: "Persistence & Backup" +--- + +Defines how persistent data is stored, backed up, and restored in production environments. Proper backup and disaster recovery procedures are essential for business continuity and data protection. + +**Key Objectives:** +- Protect against data loss from hardware failures, human errors, or disasters +- Enable point-in-time recovery for compliance and operational requirements +- Minimize Recovery Time Objective (RTO) and Recovery Point Objective (RPO) +- Validate backup integrity through regular restore testing + +## Volume layout + +Docker volumes provide persistent storage for stateful services. 
Each service stores data at specific mount points within containers: + +| Service | Default path | Data stored | +| --- | --- | --- | +| TiKV | `/data` | Distributed key-value data, Raft logs | +| PD | `/data` | Cluster metadata, timestamp oracle | +| Kafka | `/var/lib/kafka/data` | Message logs, topic partitions | +| Redis | `/data` | Cache data, pub/sub state | +| MongoDB | `/data/db` | Document collections, indexes | + +**Storage requirements:** +- All persistent volumes should be backed by SSD or NVMe storage for production deployments +- Provision adequate IOPS for database workloads (minimum 3000 IOPS for TiKV) +- Monitor disk space usage and set alerts at 75% capacity +- Plan for 30-50% growth buffer beyond current usage + +## Backup strategy + +Implement automated, regular backups for all stateful services with appropriate retention policies: + +### TiDB backups + +**Frequency**: Daily full backups, hourly incremental backups (for critical deployments) + +**Method**: Use TiDB BR (Backup & Restore) tool for consistent cluster snapshots + +```bash +# Full backup +tiup br backup full \ + --pd :2379 \ + --storage "local:///backup/$(date +%Y%m%d)" \ + --ratelimit 120 \ + --log-file backup.log + +# Incremental backup +tiup br backup incremental \ + --pd :2379 \ + --storage "local:///backup/incremental/$(date +%Y%m%d_%H%M)" \ + --lastbackupts +``` + +**Storage**: Secure, off-cluster storage (S3, NFS, or dedicated backup server) + +**Retention**: 30 days for daily backups, 90 days for monthly backups (adjust based on compliance requirements) + +### Kafka backups + +**Frequency**: Weekly segment-level backups + +**Method**: Copy Kafka data directories or use MirrorMaker for replication to backup cluster + +```bash +# Stop Kafka broker (if taking offline backup) +docker service scale kafka=0 + +# Backup Kafka data directory +tar -czf kafka-backup-$(date +%Y%m%d).tar.gz /var/lib/kafka/data + +# Restart Kafka broker +docker service scale kafka=1 +``` + +**⚠️ Warning**: Stopping Kafka brokers will interrupt message delivery. Perform offline backups only during maintenance windows. + +**Retention**: 4 weeks (Kafka data is typically transient with configurable retention) + +### MongoDB backups + +**Frequency**: Daily backups + +**Method**: Use mongodump for logical backups or filesystem snapshots for physical backups + +```bash +# Logical backup with mongodump +# Ensure mongodump uses a read-only backup user with minimal privileges +docker exec mongodb mongodump \ + --out /backup/mongodb-$(date +%Y%m%d) \ + --gzip + +# Copy backup to secure storage +docker cp mongodb:/backup/mongodb-$(date +%Y%m%d) /secure/backup/location/ +``` + +**Retention**: 30 days for daily backups, 1 year for monthly backups + +### Redis backups + +**Frequency**: RDB snapshots every 6 hours + +**Method**: Redis automatically creates RDB snapshots based on configuration + +```bash +# Trigger manual snapshot +docker exec redis redis-cli BGSAVE + +# Copy RDB file to backup location +docker cp redis:/data/dump.rdb /backup/redis-$(date +%Y%m%d_%H%M).rdb +``` + +**Note**: Redis data is non-authoritative and can be safely rebuilt from TiDB and Kafka in most scenarios. Backups are primarily for faster recovery rather than data preservation. 
+ +**Retention**: 7 days (cache data has short-term value) + +### Backup validation + +**Monthly restore tests**: Perform full restore to staging environment to verify backup integrity + +```bash +# Example: Restore TiDB backup +tiup br restore full \ + --pd :2379 \ + --storage "local:///backup/20240119" \ + --log-file restore.log + +# Verify data integrity +# Run application smoke tests against restored data +``` + +**Validation checklist**: +- Backup files are complete and not corrupted +- Restore process completes without errors +- Data integrity checks pass (row counts, checksums) +- Application can connect and query restored data +- Restore time meets RTO requirements + +## Disaster recovery + +Establish comprehensive disaster recovery procedures to ensure business continuity in the event of catastrophic failures. + +### Recovery objectives + +Define clear recovery targets based on business requirements: + +- **RTO (Recovery Time Objective)**: Maximum acceptable downtime (typically 1-4 hours for production systems) +- **RPO (Recovery Point Objective)**: Maximum acceptable data loss (typically 1-24 hours depending on backup frequency) + +**Note**: Actual RTO/RPO depends on backup size, network bandwidth, and restore automation maturity. Test and validate your specific recovery times. + +### Disaster recovery procedures + +**Full cluster restoration from backups:** + +1. **Provision new infrastructure** matching production specifications +2. **Restore data stores** in dependency order: + - TiDB/TiKV (primary data store) - Restore TiDB first as it is the authoritative source of user, conversation, and message metadata + - MongoDB (metadata and configuration) + - Kafka (if message history is critical) + - Redis (optional, can rebuild from primary data) + +3. **Restore application services** and verify connectivity +4. **Validate data integrity** through application smoke tests +5. **Update DNS** to point to new cluster +6. **Monitor closely** for 24-48 hours post-recovery + +**Example TiDB restore:** + +```bash +# Restore TiDB cluster from backup +tiup br restore full \ + --pd :2379 \ + --storage "s3://backup-bucket/20240119" \ + --log-file restore.log + +# Verify cluster health +tiup cluster display + +# Run data integrity checks +# Check row counts, run application queries +``` + +### Geographic redundancy + +**Backup storage locations:** +- Maintain a minimum of three geographically isolated backup copies +- Primary backup: Same region as production (fast recovery) +- Secondary backup: Different region (regional disaster protection) +- Tertiary backup: Different cloud provider or on-premises (provider-level disaster protection) + +**Important**: Ensure backups are completed and verified before object storage lifecycle rules expire older snapshots. + +**Replication strategies:** +- Use cloud storage replication (S3 cross-region replication, GCS multi-region) +- Implement backup verification at each location +- Test restore from each backup location quarterly + +### Disaster recovery testing + +**Quarterly DR simulations:** + +Run staged disaster recovery exercises to validate procedures and train teams: + +1. **Warm-standby restoration**: Restore to standby environment, validate without cutting over +2. **Full cluster rehydration**: Complete restore from backups in isolated environment +3. **Failover testing**: Practice DNS cutover and traffic migration procedures +4. 
**Rollback testing**: Validate ability to roll back to previous state if needed + +**DR drill checklist:** +- [ ] Backup files are accessible from all locations +- [ ] Restore procedures are documented and up-to-date +- [ ] Team members know their roles and responsibilities +- [ ] Communication channels are established +- [ ] Restore time meets RTO requirements +- [ ] Data integrity is validated post-restore +- [ ] Application functionality is verified +- [ ] Lessons learned are documented and procedures updated + +### Backup security + +**Encryption:** +- Encrypt backups at rest using AES-256 or equivalent +- Encrypt backups in transit using TLS +- Store encryption keys separately from backup data (use key management service) + +**Access control:** +- Restrict backup access to authorized personnel only +- Use separate credentials for backup operations +- Audit all backup access and modifications +- Implement multi-factor authentication for backup system access + diff --git a/fundamentals/cometchat-on-prem/docker/prerequisites.mdx b/fundamentals/cometchat-on-prem/docker/prerequisites.mdx new file mode 100644 index 00000000..6331b6a0 --- /dev/null +++ b/fundamentals/cometchat-on-prem/docker/prerequisites.mdx @@ -0,0 +1,113 @@ +--- +title: "Prerequisites" +sidebarTitle: "Prerequisites" +--- + +This guide outlines the infrastructure, software, and network requirements for deploying CometChat On-Prem. Proper capacity planning ensures optimal performance, cost efficiency, and scalability as your user base grows. + +**Planning Considerations:** +- **Hardware Sizing**: Select appropriate compute and storage resources based on expected monthly active users (MAU) and peak concurrent connections (PCC) +- **Operating System**: Choose enterprise-grade Linux distributions with long-term support +- **Network Architecture**: Plan for secure network segmentation and firewall configurations +- **Storage Strategy**: Account for data retention policies, backup requirements, and growth projections + +## Supported operating systems + +CometChat On-Prem is tested and supported on enterprise Linux distributions with long-term support (LTS) and security updates: + +- **Ubuntu 20.04 / 22.04 / 24.04 LTS**: Recommended for most deployments with 5-year support lifecycle +- **RedHat Enterprise Linux 8+**: Ideal for enterprises requiring commercial support and compliance certifications + +**Why these distributions:** +- Long-term security updates and kernel patches +- Docker and container runtime compatibility +- Enterprise support availability +- Proven stability in production environments + +## Required software + +Ensure these software dependencies are installed before deployment: + +- **Docker Engine >= 24**: Container runtime for service orchestration +- **Docker Compose v2**: Required for local development and non-Swarm workflows +- **Git**: Version control for deployment scripts and configuration management +- **OpenSSL >= 1.1**: TLS/SSL certificate generation and cryptographic operations +- **jq, curl, net-tools**: Command-line utilities for configuration, API testing, and network diagnostics + +**Installation Note**: The quick start guide includes automated installation scripts for Docker and Docker Compose on Ubuntu systems. 
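+Before moving on, it is worth confirming the host actually meets these version requirements. A quick verification pass might look like the following (exact output varies by distribution):
+
+```bash
+# Verify required software versions on the target host
+docker --version          # expect Docker Engine >= 24
+docker compose version    # expect Compose v2.x
+git --version
+openssl version           # expect OpenSSL >= 1.1
+jq --version
+curl --version | head -n 1
+```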
+ +## Minimum hardware (testing / QA) + +For development, testing, and proof-of-concept environments: + +- **8 vCPUs**: Sufficient for running all services with light load +- **16 GB RAM**: Minimum memory for core services and data stores +- **100 GB SSD**: Base storage for application data and logs (scale up based on workload and storage needs) + +**Use Cases**: Local development, CI/CD pipelines, QA testing, architecture evaluation + +**Important**: This configuration is not suitable for production workloads or performance testing at scale. + +## Production hardware + +Production sizing is based on two key metrics: **Monthly Active Users (MAU)** and **Peak Concurrent Connections (PCC)**. Choose the sizing model that matches your expected usage patterns. + +**Important**: The values below represent total cluster capacity and can be distributed across multiple nodes. + +### Baseline sizing + +Recommended for standard usage patterns where peak concurrent connections are approximately 5% of MAU: + +| MAU | Peak concurrent connections (PCC) | vCPUs | RAM | +| --- | --- | --- | --- | +| 10k | 500 | 32 | 64 GiB | +| 25k | 1,250 | 64 | 128 GiB | +| 50k | 2,500 | 96 | 192 GiB | +| 100k | 5,000 | 156 | 312 GiB | +| 200k | 10,000 | 272 | 544 GiB | + +**Storage guidance**: Start at 100 GB SSD and scale to 500 GB to 2 TB SSD depending on workload and data retention policies. + +**Typical use cases**: Consumer messaging apps, community platforms, standard business communication tools + +### High-concurrency sizing + +Recommended for applications with sustained high concurrency where peak connections reach 10% or more of MAU: + +| MAU | Peak concurrent connections (PCC) | vCPUs | RAM | +| --- | --- | --- | --- | +| 10k | 1,000 | 48 | 96 GiB | +| 25k | 2,500 | 96 | 192 GiB | +| 50k | 5,000 | 156 | 312 GiB | +| 100k | 10,000 | 240 | 480 GiB | +| 200k | 20,000 | 480 | 960 GiB | + +**Storage guidance**: Expect to exceed 100 GB SSD; plan 500 GB to 2 TB SSD as concurrency and data volume grow. + +**Typical use cases**: Real-time collaboration tools, customer support platforms, live event applications, gaming communities + +### Storage planning considerations + +**Factors affecting storage requirements:** +- **Message retention**: Longer retention periods require more storage +- **Media attachments**: Images, videos, and files significantly increase storage needs +- **Backup strategy**: Plan for 2-3x storage capacity to accommodate backups +- **Log retention**: Compliance requirements may mandate extended log retention (30-90 days or more) +- **Growth buffer**: Allocate 30-50% additional capacity for unexpected growth + +**Storage performance**: Use SSD or NVMe storage for all production deployments to ensure optimal database and Kafka performance. + +## Required ports + +Configure firewall rules to allow traffic on these ports: + +- **80 / 443**: HTTP and HTTPS traffic to NGINX (public-facing). Port 80 should be used only for HTTP-to-HTTPS redirection. + +**Network security recommendations:** +- Restrict ports 80/443 to known IP ranges when possible +- Use TLS/SSL certificates for all HTTPS traffic +- Keep all backend services (databases, Kafka, Redis) on private networks without public exposure +- Implement rate limiting and DDoS protection at the load balancer level + +**Additional ports for internal communication:** +All inter-service communication occurs on Docker Swarm's private overlay network and does not require external firewall rules. 
Services communicate securely within the cluster using Docker's encrypted overlay networking. diff --git a/fundamentals/cometchat-on-prem/docker/production-deployment.mdx b/fundamentals/cometchat-on-prem/docker/production-deployment.mdx new file mode 100644 index 00000000..ecbb38e6 --- /dev/null +++ b/fundamentals/cometchat-on-prem/docker/production-deployment.mdx @@ -0,0 +1,331 @@ +--- +title: "Production Deployment" +sidebarTitle: "Production Deployment" +--- + +This guide walks through deploying CometChat On-Prem to a production Docker Swarm cluster. The deployment process uses automated scripts to ensure consistent, repeatable deployments with proper service orchestration and zero-downtime updates. + +**Deployment Overview:** +- Initialize Docker Swarm cluster for container orchestration +- Deploy all services using infrastructure-as-code approach +- Configure domain mappings and TLS certificates +- Validate deployment health across all components +- Establish operational procedures for updates and maintenance + +**Prerequisites:** +- Infrastructure meeting [hardware requirements](./prerequisites) +- Docker Engine >= 24 installed on all nodes +- Network connectivity between cluster nodes +- Domain names configured and DNS records pointing to your cluster +- TLS/SSL certificates for HTTPS endpoints + +## Initialize Docker Swarm (manager node) + +Docker Swarm provides native clustering and orchestration for Docker containers. Initialize Swarm on your primary manager node: + +```bash +# Initialize Swarm cluster +docker swarm init + +# For hosts with multiple network interfaces, specify the advertise address +docker swarm init --advertise-addr + +# Verify cluster status +docker node ls +``` + +**Expected output**: You should see one node listed with STATUS "Ready" and MANAGER STATUS showing "Leader". + +**Multi-node clusters:** +For high availability, add additional manager and worker nodes: + +```bash +# On manager node: Get join token for additional managers +docker swarm join-token manager + +# On manager node: Get join token for worker nodes +docker swarm join-token worker + +# On additional nodes: Run the join command provided above +docker swarm join --token :2377 +``` + +**High availability recommendations:** +- Deploy 3 or 5 manager nodes (odd numbers for quorum) +- Distribute manager nodes across availability zones when possible +- Use worker nodes for application workloads, managers for orchestration only + +## Deploy the complete infrastructure + +Run the following from the repository root on the Swarm manager node: + +```bash +./deploy.sh +``` + +**What the deployment script does:** +1. Initializes required Docker volumes for persistent data +2. Creates secure overlay networks for service communication +3. Deploys all services defined in `docker-compose.yml` with proper dependencies +4. Starts components in the correct order (data stores → backend services → frontend) +5. Applies health checks and restart policies + +**Deployment time**: Initial deployment typically takes 5-10 minutes depending on image pull speeds and cluster size. 
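+For reference, the core operations the script automates resemble the commands below. The overlay network name is illustrative and the actual script may differ; treat this as a sketch of the approach rather than a substitute for `./deploy.sh`.
+
+```bash
+# Create an encrypted overlay network for service-to-service traffic
+docker network create --driver overlay --opt encrypted cometchat-net
+
+# Deploy every service defined in the compose file as a Swarm stack
+docker stack deploy -c docker-compose.yml cometchat
+```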
+ +**Monitoring deployment progress:** + +```bash +# Watch service deployment status +watch docker service ls + +# Check specific service deployment +docker service ps + +# View service logs during deployment +docker service logs -f +``` + +**Deployment validation:** +After deployment completes, verify all services are running: + +```bash +# All services should show replicas as "X/X" (e.g., "1/1" or "3/3") +docker service ls + +# Check for any failed services +docker service ls | grep "0/" +``` + +## Rolling updates + +Perform zero-downtime updates to services using the update script: + +```bash +./update.sh +``` + +**What the update script does:** +- Pulls latest container images from the registry +- Performs rolling updates service by service +- Refreshes configuration and environment variables +- Replaces containers gradually while maintaining availability +- Automatically rolls back if health checks fail + +**Update behavior:** +- Services are updated one replica at a time (configurable) +- New replicas must pass health checks before old ones are removed +- Traffic continues flowing to healthy replicas during updates +- Failed updates trigger automatic rollback to previous version + +**Manual service updates:** +For granular control, update individual services: + +```bash +# Update specific service to new image version +docker service update --image /: + +# Update with custom rollback settings +docker service update \ + --update-parallelism 2 \ + --update-delay 10s \ + --update-failure-action rollback \ + +``` + +**Best practices:** +- Test updates in staging environment first +- Perform updates during low-traffic periods when possible +- Monitor service logs and metrics during updates +- Keep previous image versions available for quick rollback + +## Production NGINX reverse proxy + +NGINX serves as the edge layer, handling TLS termination, API routing, WebSocket protocol upgrades, and proxy buffering for optimal performance. 
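+For orientation, here is a trimmed sketch of what one of these server blocks typically looks like for the WebSocket gateway. The hostname, upstream service name, port, and certificate paths are placeholders; the configuration files shipped with the deployment are more complete.
+
+```nginx
+server {
+    listen 443 ssl http2;
+    server_name websocket.example.com;
+
+    ssl_certificate     /path/to/your/certificate.crt;
+    ssl_certificate_key /path/to/your/private.key;
+
+    location / {
+        # Forward to the WebSocket service on the Swarm overlay network
+        proxy_pass http://websocket:8080;
+        proxy_http_version 1.1;
+        # Handle the HTTP -> WebSocket protocol upgrade
+        proxy_set_header Upgrade $http_upgrade;
+        proxy_set_header Connection "upgrade";
+        proxy_set_header Host $host;
+        # Keep long-lived real-time connections open
+        proxy_read_timeout 3600s;
+    }
+}
+```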
+ +**Configuration files to update:** + +Each service has a dedicated NGINX configuration file that must be updated with your production domain: + +- `chatapi.conf` - Chat API routing and WebSocket upgrade rules +- `extensions.conf` - Extensions API routing +- `mgmtapi.conf` - Management API routing +- `notifications.conf` - Notifications service routing +- `dashboard.conf` - Frontend dashboard routing +- `globalwebhooks.conf` - Webhooks service routing +- `moderation.conf` - Moderation service routing +- `websocket.conf` - WebSocket gateway routing + +**Key NGINX responsibilities:** +- **TLS/SSL Termination**: Decrypt HTTPS traffic at the edge, communicate with backend services over encrypted overlay network +- **Load Balancing**: Distribute requests across service replicas using round-robin or least-connections algorithms +- **WebSocket Upgrades**: Handle HTTP to WebSocket protocol upgrades for real-time connections +- **Request Buffering**: Buffer client requests to protect backend services from slow clients +- **Rate Limiting**: Protect services from abuse and DDoS attacks (configure as needed) + +**TLS certificate configuration:** +Place your TLS certificates in the NGINX configuration directory and reference them in each `.conf` file: + +```nginx +ssl_certificate /path/to/your/certificate.crt; +ssl_certificate_key /path/to/your/private.key; +``` + +**Important**: Reload NGINX after updating certificates to apply changes without downtime: + +```bash +docker service update --force nginx +``` + +**Security recommendations:** +- Use TLS 1.2 or higher only +- Configure strong cipher suites +- Enable HSTS (HTTP Strict Transport Security) +- Implement rate limiting for public endpoints + +## Domain configuration (production) + +Production deployments require updating domain references across multiple services and configuration files. Replace all instances of `` with your actual production domain. + +**Services requiring domain configuration:** +- Chat API - Core messaging service endpoints +- Management API - Administrative and configuration endpoints +- Extensions - Custom extension endpoints +- Notifications - Push notification service +- Moderation - Content moderation service +- Webhooks - Outbound webhook delivery +- SQL Consumer - Database migration and provisioning service +- NGINX - Reverse proxy routing rules + +**Critical alignment:** +Ensure the WebSocket host configured in Chat API matches your chosen domain exactly (e.g., `websocket.chat.example.com`). Mismatched domains will cause WebSocket connection failures. + +**Configuration checklist:** +1. Update all service environment variables with production domains +2. Update NGINX configuration files with matching domains +3. Verify DNS records point to your cluster load balancer +4. Configure TLS certificates for all public-facing domains +5. Test domain resolution from external networks + +**Example domain structure:** +``` +api.example.com → Chat API +apiclient.example.com → Client API +apimgmt.example.com → Management API +websocket.example.com → WebSocket Gateway +notifications.example.com → Notifications Service +webhooks.example.com → Webhooks Service +moderation.example.com → Moderation Service +app.example.com → Dashboard +``` + +**Note**: Your actual deployment may use region-specific naming (e.g., `api-us.example.com`) or other conventions. Ensure consistency across all configuration files and DNS records. + +For detailed environment variable mappings, see the [Configuration Reference](./configuration-reference). 
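+One of the checklist items above is testing domain resolution and TLS from outside the cluster. A quick spot-check, using the example domain structure above (`dig` is provided by the `dnsutils` package on Ubuntu), might look like this:
+
+```bash
+# Confirm DNS resolves to the cluster load balancer
+dig +short api.example.com
+dig +short websocket.example.com
+
+# Confirm the certificate is served and the endpoint responds over HTTPS
+curl -vI https://api.example.com 2>&1 | grep -E "subject:|HTTP/"
+```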
+
+## Useful production commands (Swarm operations)
+
+### General service management
+
+```bash
+# List all nodes in the cluster
+docker node ls
+
+# List all services and their status
+docker service ls
+
+# View detailed service information
+docker service inspect <service-name>
+
+# Check service replica status and placement
+docker service ps <service-name>
+
+# Follow service logs in real-time
+docker service logs -f <service-name>
+
+# Execute command in a service container
+docker exec -it <container-id> bash
+
+# Drain a node for maintenance (stops scheduling new tasks)
+docker node update --availability drain <node-name>
+
+# Return node to active status
+docker node update --availability active <node-name>
+
+# Scale a service to more replicas
+docker service scale <service-name>=3
+```
+
+### Stack operations
+
+```bash
+# Deploy or update entire stack
+docker stack deploy -c docker-compose.yml cometchat
+
+# Remove entire stack (WARNING: destructive)
+docker stack rm cometchat
+
+# List all services in a stack
+docker stack services cometchat
+
+# List all tasks in a stack
+docker stack ps cometchat
+
+# View stack configuration
+docker stack config -c docker-compose.yml
+```
+
+### Troubleshooting commands
+
+```bash
+# View service events and errors
+docker service ps <service-name> --no-trunc
+
+# Inspect service configuration
+docker service inspect <service-name> --pretty
+
+# Check service update status
+docker service inspect <service-name> --format='{{json .UpdateStatus}}'
+
+# View container resource usage
+docker stats
+
+# Inspect network configuration
+docker network inspect <network-name>
+
+# View volume information
+docker volume ls
+docker volume inspect <volume-name>
+```
+
+## Health check endpoints
+
+After deployment, verify all services are healthy by checking their health endpoints:
+
+| Component | URL |
+| --- | --- |
+| Dashboard | `https://app.example.com` |
+| Chat API | `https://api-us.example.com/health-check` |
+| Client API | `https://apiclient-us.example.com/health-check` |
+| Management API | `https://apimgmt.example.com/health-check` |
+| Notifications | `https://notifications-us.example.com/health-check` |
+| Moderation | `https://rule-us.example.com/health` |
+| WebSocket | `https://websocket-us.example.com/v1/health` |
+| Webhooks | `https://webhooks-us.example.com/v1/webhooks/health-check` |
+
+Replace `example.com` with your actual domain, such as `chat.example.com`.
+
+**Health check validation:**
+
+```bash
+# Check all health endpoints
+curl -f https://api-us.example.com/health-check || echo "Chat API unhealthy"
+curl -f https://websocket-us.example.com/v1/health || echo "WebSocket unhealthy"
+curl -f https://notifications-us.example.com/health-check || echo "Notifications unhealthy"
+```
+
+**Expected responses:**
+- HTTP 200 status code indicates healthy service
+- HTTP 503 or connection errors indicate service issues requiring investigation
+
+**Automation**: Health checks should be integrated into monitoring and automated validation scripts. See the [Monitoring](./monitoring) guide for comprehensive health monitoring setup.
+
diff --git a/fundamentals/cometchat-on-prem/docker/quick-start.mdx b/fundamentals/cometchat-on-prem/docker/quick-start.mdx
new file mode 100644
index 00000000..26aeda57
--- /dev/null
+++ b/fundamentals/cometchat-on-prem/docker/quick-start.mdx
@@ -0,0 +1,54 @@
+---
+title: "Quick Start (Local Deployment)"
+sidebarTitle: "Quick Start"
+---
+
+Run the platform locally on a single machine with Docker Compose for development and QA. This setup is not hardened for production workloads.
+ +## Install Git + +```bash +sudo apt update -y +sudo apt install git -y +``` + +## Clone the repository + +```bash +git clone https://github.com/cometchat-team/cometchat-chat-api-infra-backend.git +cd cometchat-chat-api-infra-backend +git switch docker-swarm-setup-x86 +``` + +## Install Docker & Docker Compose (local only) + +Use the included script, then verify the installation: + +```bash +./install-docker-ubuntu.sh +docker --version +docker-compose --version +``` + +## Start the local Docker environment + +```bash +docker compose pull +docker compose up -d +``` + +Result: all services required for local development start on one host. + +## Validate the local deployment + +- WebSocket Gateway: `http://localhost/v1/health` +- Chat API health check: `http://localhost/health` + +## What this local setup includes + +- Kafka +- Redis +- MongoDB +- Local Docker volumes for persistence + +> Intended for development, debugging, and API testing only. Do not use this environment for production traffic. diff --git a/fundamentals/cometchat-on-prem/docker/scaling.mdx b/fundamentals/cometchat-on-prem/docker/scaling.mdx new file mode 100644 index 00000000..7117327e --- /dev/null +++ b/fundamentals/cometchat-on-prem/docker/scaling.mdx @@ -0,0 +1,191 @@ +--- +title: "Scaling" +sidebarTitle: "Scaling" +--- + +Guidelines for scaling platform components based on load and resource requirements. Proper scaling ensures optimal performance, cost efficiency, and user experience as your deployment grows. + +**Scaling Strategies:** +- **Vertical scaling**: Increase resources (CPU, RAM, storage) on existing nodes +- **Horizontal scaling**: Add more service replicas or nodes to distribute load +- **Capacity planning**: Proactively scale based on growth projections and monitoring data + +**When to scale:** +- CPU utilization consistently above 70% +- Memory usage approaching 85% +- API latency exceeding SLA targets (P95 > 100ms) +- WebSocket connection limits approaching capacity +- Database query performance degrading + +## Vertical scaling + +Increase system resource limits and tune configurations to handle more load on existing servers. Vertical scaling is often the first step before adding more nodes. + +**Benefits:** +- Simpler than horizontal scaling (no distributed system complexity) +- Immediate performance improvement +- Lower operational overhead + +**Limitations:** +- Hardware limits (maximum CPU, RAM per server) +- Single point of failure remains +- Downtime required for hardware upgrades + +**Key optimizations:** +- Raise file descriptor limits for high-concurrency workloads +- Tune kernel network queues (`somaxconn`, `netdev_max_backlog`) +- Increase worker processes and thread pools where supported +- Allocate more CPU and memory to Docker services + +### Configure file descriptor limits + +1. Edit `/etc/security/limits.conf` and add: + +``` +* soft nofile 500000 +* hard nofile 500000 +root soft nofile 500000 +root hard nofile 500000 +``` + +2. Configure systemd defaults: + +```bash +echo "DefaultLimitNOFILE=500000" | sudo tee -a /etc/systemd/system.conf +echo "DefaultLimitNOFILE=500000" | sudo tee -a /etc/systemd/user.conf +``` + +3. Reboot to apply changes: + +```bash +sudo reboot +``` + +4. 
Verify: + +```bash +ulimit -n +``` + +### Allocate more resources to Docker services + +Increase CPU and memory limits for services experiencing resource constraints: + +```bash +# Update service resource limits +docker service update \ + --limit-cpu 4 \ + --limit-memory 8G \ + chat-api + +# Verify updated limits +docker service inspect chat-api --format='{{.Spec.TaskTemplate.Resources.Limits}}' +``` + +## Horizontal scaling + +Add more service replicas or nodes to distribute load across multiple servers. Horizontal scaling provides better fault tolerance and unlimited growth potential. + +### Scaling application services + +**WebSocket Gateway:** +- **Scaling ratio**: Add ~1 replica per 1,000-1,500 peak concurrent connections (PCC) +- **Command**: `docker service scale websocket=5` +- **Considerations**: Ensure load balancer distributes connections evenly; sticky sessions if needed are typically handled at the NGINX layer using IP hash or consistent hashing + +**Chat API:** +- **Scaling trigger**: Scale out when average CPU utilization exceeds ~60% +- **Command**: `docker service scale chat-api=5` +- **Considerations**: Stateless design allows unlimited horizontal scaling + +**Notifications Service:** +- **Scaling trigger**: High push notification queue depth or processing delays +- **Command**: `docker service scale notifications=3` + +**Webhooks Service:** +- **Scaling trigger**: Webhook delivery delays or high retry rates +- **Command**: `docker service scale webhooks=3` + +### Scaling data stores + +**Kafka:** +- **Scaling method**: Increase partition count to improve throughput and parallelism +- **Command**: + ```bash + kafka-topics --alter --topic \ + --partitions \ + --bootstrap-server + ``` +- **Considerations**: More partitions = more parallelism, but also more overhead; balance based on workload. Avoid frequent partition changes during peak traffic to prevent rebalance storms. + +**Redis:** +- **Scaling trigger**: Enable Redis Cluster mode when deployments exceed ~200k MAU +- **Benefits**: Distributes data across multiple nodes, improves scalability and fault tolerance +- **⚠️ Warning**: Redis Cluster mode is not backward-compatible with standalone Redis. Migration requires application awareness and careful testing. + +**TiDB/TiKV:** +- **Scaling method**: Add more TiKV nodes to distribute data and increase storage capacity +- **Command**: Add nodes to cluster using TiUP +- **Considerations**: TiDB automatically rebalances data across new nodes + +**MongoDB:** +- **Scaling method**: Enable sharding for horizontal data distribution +- **⚠️ Warning**: Shard key selection is critical and effectively irreversible. Poor shard keys can cause uneven data distribution and performance issues. + +### Monitoring scaling effectiveness + +After scaling, monitor these metrics to validate improvements: + +- **CPU and memory utilization**: Should decrease proportionally +- **API latency**: P95 and P99 should improve +- **Error rates**: Should remain stable or decrease +- **Throughput**: Requests per second should increase +- **Connection counts**: Should distribute evenly across replicas + +**Important**: If metrics do not improve within 10–15 minutes, reassess scaling assumptions or investigate downstream bottlenecks. + +## When to migrate to Kubernetes + +Docker Swarm is recommended for most deployments up to ~200k MAU. Consider migrating to Kubernetes when you need advanced orchestration features or exceed Swarm's practical limits. 
+ +**Kubernetes migration triggers:** + +- **Scale**: MAU exceeds ~200k or peak concurrent connections exceed ~20k +- **Multi-region**: You need active-active deployments across multiple geographic regions +- **Latency requirements**: Sub-50ms latency targets requiring advanced traffic management +- **Autoscaling**: Dynamic autoscaling based on custom metrics (HPA/VPA) is critical +- **Service mesh**: You need mTLS, advanced traffic routing, or observability features (Istio, Linkerd) +- **Cloud-native tooling**: You want to leverage Kubernetes-native tools and operators + +**Kubernetes benefits:** +- Unlimited horizontal scalability with automated capacity management +- Advanced autoscaling (Horizontal Pod Autoscaler, Vertical Pod Autoscaler) +- Multi-region active-active deployments with global load balancing +- Service mesh integration for mTLS and advanced traffic management +- Rich ecosystem of operators and tools (Kafka operators, database operators) +- GitOps workflows for declarative infrastructure management + +**Migration considerations:** +- Higher operational complexity and learning curve +- More infrastructure overhead (control plane, etcd, etc.) +- Requires Kubernetes expertise on the team +- Migration effort for existing deployments + +**Next steps for Kubernetes:** + +Our solutions team provides Kubernetes reference architectures, migration planning, and ongoing operational guidance tailored to your specific requirements. + +**Contact Enterprise Solutions:** +- **Email**: enterprise@cometchat.com +- **Schedule consultation**: [Contact Form](https://www.cometchat.com/contact) +- **Response time**: Typically within 1 business day + +**What to prepare:** +- Current or projected MAU and PCC +- Geographic distribution requirements +- Compliance requirements (GDPR, HIPAA, SOC 2) +- Existing infrastructure and Kubernetes experience +- Timeline and deployment goals + +For detailed Kubernetes deployment information, see the [Kubernetes Overview](../kubernetes/overview). + diff --git a/fundamentals/cometchat-on-prem/docker/security.mdx b/fundamentals/cometchat-on-prem/docker/security.mdx new file mode 100644 index 00000000..3eae8f45 --- /dev/null +++ b/fundamentals/cometchat-on-prem/docker/security.mdx @@ -0,0 +1,219 @@ +--- +title: "Security" +sidebarTitle: "Security" +--- + +Security controls focus on authentication, secrets management, network isolation, TLS posture, and protective controls against abuse. + +## Authentication + +Use JWT-based authentication with RSA key pairs so only authorized users interact with the platform. Single sign-on via OIDC or SAML 2.0 can be layered on when needed. + +Actionable steps: +- Generate an RSA key pair (example): + +```bash +openssl genpkey -algorithm RSA -out private.key -pkeyopt rsa_keygen_bits:3072 +openssl rsa -pubout -in private.key -out public.key +``` + +- Validate JWTs in your backend with the public key. +- Rotate signing keys every 30-90 days for long-running deployments. + +## Secrets management + +Centralize and encrypt sensitive data such as passwords, API keys, database credentials, and tokens. Avoid storing secrets directly in environment variables or committing them to configuration files. Use secrets managers or mounted secret files instead. 
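+
+For example, in a Docker Swarm stack file a service can reference a named secret so the value is mounted as a file under `/run/secrets/` rather than injected as an environment variable. The service, image, and secret names below are illustrative:
+
+```yaml
+version: "3.8"
+services:
+  chat-api:
+    image: registry.example.com/chat-api:latest
+    secrets:
+      - db_password            # mounted at /run/secrets/db_password inside the container
+secrets:
+  db_password:
+    external: true             # created beforehand, e.g. with `docker secret create db_password -`
+```
+
+The sections below cover two common ways to manage the secret values themselves.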
+ +### HashiCorp Vault (recommended for enterprise) + +Vault provides enterprise-grade secrets management with RBAC, audit logging, and encrypted storage: + +**Key features:** +- Dynamic secrets with automatic rotation +- Fine-grained access control policies +- Comprehensive audit trails for compliance +- Encryption as a service +- Multi-cloud and hybrid support + +**Basic usage:** + +```bash +# Store a secret +vault kv put secret/database/password value="your-secure-password" + +# Retrieve a secret +vault kv get secret/database/password + +# Rotate a secret +vault kv put secret/database/password value="new-secure-password" +``` + +**Integration with applications:** +- Use Vault APIs or client libraries to retrieve secrets at runtime +- Implement automatic secret rotation without service restarts +- Configure lease durations and renewal policies +- Enable audit logging for all secret access + +### Docker Swarm secrets + +For simpler deployments, use Docker Swarm's built-in secrets management: + +**Create and use secrets:** + +```bash +# Create a secret from stdin +echo "your-secure-password" | docker secret create db_password - + +# Create a secret from file +docker secret create tls_cert ./certificate.crt + +# List secrets +docker secret ls + +# Use secret in service +docker service create \ + --name myservice \ + --secret db_password \ + myimage:latest +``` + +**Access secrets in containers:** +Secrets are mounted as files in `/run/secrets/` directory: + +```bash +# Read secret in application +cat /run/secrets/db_password +``` + +**Security properties:** +- Encrypted in transit and at rest +- Only accessible to authorized services +- Automatically removed when service is deleted +- Immutable (must delete and recreate to update) + +### Best practices + +- **Principle of least privilege**: Grant secrets access only to services that need them +- **Regular rotation**: Rotate secrets on a defined schedule (30-90 days) +- **Audit logging**: Monitor and log all secret access for security analysis +- **Backup and recovery**: Maintain secure backups of secrets management systems +- **Separation of duties**: Require multiple approvals for sensitive secret operations + +## Network security + +Run backend services on private overlay networks and expose only NGINX to the internet. + +Actionable steps: +- Create a private overlay network: + +```bash +docker network create --driver overlay private_network +``` + +- Harden firewall rules to allow only the necessary ports (80/443) and block the rest. Apply firewall rules at the host or network perimeter level, not inside containers: + +```bash +sudo ufw allow 80,443/tcp +sudo ufw default deny incoming +sudo ufw enable +``` + +## TLS configuration + +Encrypt all traffic with modern TLS protocols and strong cipher suites. 
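+
+Once a service is reachable, you can spot-check which protocol and cipher an endpoint actually negotiates with an OpenSSL probe (assumes OpenSSL 1.1.0+ for the `-brief` flag; the domain is a placeholder):
+
+```bash
+openssl s_client -connect api.example.com:443 -brief </dev/null
+# Look for the negotiated protocol version and ciphersuite in the output
+```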
+ +### NGINX TLS configuration + +Configure NGINX with security best practices: + +```nginx +# Use only modern TLS versions +ssl_protocols TLSv1.2 TLSv1.3; + +# Strong cipher suites (prioritize forward secrecy) +ssl_ciphers 'ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305'; +ssl_prefer_server_ciphers off; + +# Enable HSTS (HTTP Strict Transport Security) +add_header Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" always; + +# Disable SSL session tickets for better security +ssl_session_tickets off; + +# OCSP stapling for certificate validation +ssl_stapling on; +ssl_stapling_verify on; +resolver 8.8.8.8 8.8.4.4 valid=300s; # Use internal DNS resolvers where required by corporate security policies +resolver_timeout 5s; + +# DH parameters for forward secrecy +ssl_dhparam /path/to/dhparam.pem; +``` + +### Certificate management + +**Obtain certificates:** +- **Let's Encrypt**: Free, automated certificates with 90-day validity +- **Commercial CA**: Extended validation (EV) certificates for enterprise trust +- **Internal CA**: For internal services and development environments + +**Certificate rotation strategy:** + +```bash +# Generate DH parameters (one-time, takes several minutes) +openssl dhparam -out /etc/nginx/dhparam.pem 2048 + +# Maintain multiple certificates for zero-downtime rotation +# 1. Obtain new certificate +# 2. Update NGINX configuration with new certificate path +# 3. Test configuration +nginx -t + +# 4. Reload NGINX (zero downtime) +docker service update --force nginx +``` + +**Automation with Let's Encrypt:** +- Use Certbot or similar tools for automatic renewal +- Configure renewal hooks to reload NGINX after certificate updates +- Monitor certificate expiration dates (alert 30 days before expiry) + +### Internal service encryption + +**Docker Swarm overlay network encryption:** + +```bash +# Create encrypted overlay network +docker network create \ + --driver overlay \ + --opt encrypted \ + secure_backend +``` + +**Benefits:** +- Automatic encryption of all inter-service communication +- No application code changes required +- Protects against network sniffing within the cluster + +**Database connections:** +- Enable TLS for TiDB, MongoDB, and Redis connections +- Use certificate-based authentication where supported +- Encrypt backup data in transit and at rest + +## Additional security measures + +- Rate limiting: protect against abuse or DDoS using NGINX rate limits. Tune rate limits based on expected traffic patterns to avoid impacting legitimate users: + +```nginx +limit_req_zone $binary_remote_addr zone=mylimit:10m rate=10r/s; +limit_req zone=mylimit burst=20; +``` + +- IP allowlisting: restrict access to sensitive services: + +```bash +sudo ufw allow from 192.168.1.0/24 to any port 3306 +``` + +- Log monitoring: collect and monitor logs (e.g., Prometheus alerts, Grafana dashboards, ELK/Loki) to detect suspicious activity. + diff --git a/fundamentals/cometchat-on-prem/docker/troubleshooting.mdx b/fundamentals/cometchat-on-prem/docker/troubleshooting.mdx new file mode 100644 index 00000000..0e9d342d --- /dev/null +++ b/fundamentals/cometchat-on-prem/docker/troubleshooting.mdx @@ -0,0 +1,43 @@ +--- +title: "Troubleshooting" +sidebarTitle: "Troubleshooting" +--- + +Common operational issues and debugging guidance. 
+
+## Common problems and likely causes
+
+### 502 errors
+- Possible cause: Chat API unreachable or unhealthy behind NGINX.
+- Resolution:
+  - Ensure the Chat API service is running: `docker service ps <service-name>`
+  - Check NGINX logs and upstream configuration to verify routing and upstream health.
+
+### Kafka lag
+- Possible cause: Consumer slowdown or insufficient partition count.
+- Resolution:
+  - Check Kafka consumer lag: `kafka-consumer-groups --describe --group <consumer-group> --bootstrap-server <broker-address>`
+  - Increase partitions if needed: `kafka-topics --alter --partitions <partition-count> --topic <topic-name> --bootstrap-server <broker-address>`
+
+### Redis eviction
+- Possible cause: Memory pressure or incorrect eviction policy.
+- Resolution:
+  - Inspect memory settings: `redis-cli config get maxmemory` and `redis-cli config get maxmemory-policy`
+  - Set an eviction policy such as `redis-cli config set maxmemory-policy allkeys-lru`
+
+### TiKV region errors
+- Possible cause: Disk latency, resource contention, or store imbalance.
+- Resolution:
+  - Check TiKV store status: `tiup cluster display <cluster-name>`
+  - Rebalance regions if needed: `tiup cluster restart <cluster-name> --force`
+
+## Debugging commands
+
+### Container and Swarm diagnostics
+- View container logs: `docker logs <container-id>`
+- Check service status: `docker service ps <service-name>`
+- Inspect container details: `docker inspect <container-id>`
+
+### TiDB cluster status
+- Display cluster status with TiUP: `tiup cluster display <cluster-name>`
+
diff --git a/fundamentals/cometchat-on-prem/docker/upgrades.mdx b/fundamentals/cometchat-on-prem/docker/upgrades.mdx
new file mode 100644
index 00000000..4e2d97cf
--- /dev/null
+++ b/fundamentals/cometchat-on-prem/docker/upgrades.mdx
@@ -0,0 +1,197 @@
+---
+title: "Upgrades"
+sidebarTitle: "Upgrades"
+---
+
+This document outlines the recommended upgrade strategy to ensure zero downtime and safe production rollouts.
+
+## Required inputs
+
+Before starting the upgrade, ensure you have:
+
+- Target release version (e.g., v3.9.52)
+- Container registry access credentials
+- Access to the Swarm manager node
+
+## Pre-upgrade checklist
+
+Before performing any upgrade:
+
+1. **Backup critical data**:
+   - Database snapshots (TiDB/TiKV)
+   - Redis persistence files
+   - Configuration files
+
+2. **Verify current system health**:
+   ```bash
+   docker service ls
+   docker stack ps <stack-name>
+   ```
+
+3. **Test the upgrade in a staging environment** first
+
+4. **Schedule a maintenance window** (if needed for major upgrades)
+
+5. **Notify team members** of the upgrade
+
+## Upgrade execution steps
+
+Upgrades should be performed during low-traffic periods whenever possible.
+
+### Step 1: Pull latest images
+
+```bash
+docker pull <registry>/chat-api:<version>
+docker pull <registry>/websocket-gateway:<version>
+# Pull other service images as needed
+```
+
+### Step 2: Apply updates using the update script
+
+To update services without a full redeploy:
+
+```bash
+./update.sh
+```
+
+This script is provided as part of the CometChat On-Prem deployment and must be executed from the Swarm manager node. It performs rolling updates of services while maintaining availability and honoring Docker Swarm update policies.
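+
+Before running it, you can confirm that the current node is a Swarm manager (a quick sketch; any node where this prints `true` can drive the rolling update):
+
+```bash
+docker info --format '{{.Swarm.ControlAvailable}}'   # "true" on Swarm manager nodes
+```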
+
+### Step 3: Monitor the rollout
+
+Watch service updates in real-time:
+
+```bash
+docker service ps <service-name> --no-trunc
+docker service logs -f <service-name>
+```
+
+### Step 4: Verify health checks
+
+Ensure new replicas pass health checks:
+
+```bash
+docker service inspect <service-name> --format='{{json .UpdateStatus}}'
+```
+
+## Rolling updates
+
+Docker Swarm performs rolling updates automatically when using `./update.sh` or manual service updates:
+
+```bash
+# Example for updating a single service
+docker service update --image <registry>/chat-api:<version> <service-name>
+```
+
+The process:
+- Deploy new service replicas alongside existing ones
+- Gradually shift traffic to the updated replicas
+- Retire older replicas only after health checks pass
+
+### Configure update behavior
+
+Control rolling update parameters:
+
+```bash
+docker service update \
+  --update-parallelism 2 \
+  --update-delay 10s \
+  --update-failure-action rollback \
+  <service-name>
+```
+
+## Database migrations
+
+### Step 1: Backup database
+
+```bash
+# TiDB backup example
+tiup br backup full --pd <pd-address> --storage "local:///backup/$(date +%Y%m%d)"
+```
+
+### Step 2: Test migration in staging
+
+Always test migrations in a staging environment before production.
+
+### Step 3: Run migration
+
+```bash
+# Example migration command (adjust for your setup)
+docker exec -it <container-id> npm run migrate
+```
+
+**Important:** Ensure only one instance runs migrations to avoid concurrent schema changes.
+
+### Best practices
+
+- Prefer backward-compatible schema changes
+- Avoid dropping or renaming columns while serving live traffic
+- Use feature flags to decouple code deployment from schema changes
+- Keep migrations idempotent (safe to run multiple times)
+
+## Post-upgrade verification
+
+After the upgrade completes:
+
+1. **Check service status**:
+   ```bash
+   docker service ls
+   docker service ps <service-name>
+   ```
+
+2. **Verify application health**:
+   ```bash
+   curl http://<your-domain>/health
+   ```
+
+3. **Monitor logs for errors**:
+   ```bash
+   docker service logs --tail 100 <service-name>
+   ```
+
+4. **Check metrics and dashboards** (Prometheus/Grafana) for latency, error rates, and resource spikes
+
+5. **Test critical user flows** (login, messaging, etc.)
+
+## Rollback procedures
+
+If issues are detected after an upgrade:
+
+### Automatic rollback
+
+Docker Swarm can automatically roll back on failure:
+
+```bash
+docker service update --rollback <service-name>
+```
+
+### Manual rollback to previous image
+
+```bash
+docker service update --image <registry>/chat-api:<previous-version> <service-name>
+```
+
+### Database rollback
+
+If a database migration needs reverting:
+
+```bash
+# Run down migration (adjust for your setup)
+docker exec -it <container-id> npm run migrate:down
+```
+
+Or restore from backup:
+
+```bash
+# TiDB restore example
+tiup br restore full --pd <pd-address> --storage "local:///backup/<backup-date>"
+```
+
+### Rollback checklist
+
+1. Revert service images to previous versions
+2. Roll back database migrations if the schema changed
+3. Restore configuration files if modified
+4. Validate application behavior and data integrity
+5. Monitor the system for 15–30 minutes before restoring full traffic
+6.
Document the issue and root cause for post-mortem + diff --git a/fundamentals/cometchat-on-prem/kubernetes/overview.mdx b/fundamentals/cometchat-on-prem/kubernetes/overview.mdx new file mode 100644 index 00000000..8a06d554 --- /dev/null +++ b/fundamentals/cometchat-on-prem/kubernetes/overview.mdx @@ -0,0 +1,103 @@ +--- +title: "Kubernetes Deployment" +sidebarTitle: "Overview" +--- + +CometChat On-Prem on Kubernetes provides enterprise-grade orchestration for large-scale deployments requiring advanced features like multi-region active-active architectures, dynamic autoscaling, and service mesh integration. + +## When to use Kubernetes + +Kubernetes is recommended for deployments that meet one or more of these criteria: + +**Scale Requirements:** +- Monthly Active Users (MAU) exceeding 200,000 +- Peak Concurrent Connections (PCC) exceeding 20,000 +- Multi-region or global deployments with active-active failover + +**Advanced Features:** +- Dynamic autoscaling based on custom metrics (HPA/VPA) +- Service mesh for mTLS and advanced traffic management (Istio, Linkerd) +- GitOps workflows for declarative infrastructure management +- Cloud-native operators for Kafka, databases, and other components + +**Operational Requirements:** +- Sub-50ms latency targets requiring sophisticated traffic routing +- Compliance requirements demanding advanced security policies +- Integration with existing Kubernetes infrastructure + +## Kubernetes vs Docker Swarm + +| Feature | Docker Swarm | Kubernetes | +| --- | --- | --- | +| **Recommended Scale** | Up to 200k MAU | 200k+ MAU | +| **Operational Complexity** | Low | High | +| **Learning Curve** | Minimal | Steep | +| **Autoscaling** | Manual | Advanced (HPA/VPA) | +| **Multi-region** | Manual setup | Native support | +| **Service Mesh** | Not supported | Native integration | +| **Ecosystem** | Limited | Extensive | +| **Best For** | Mid-market, simplicity | Enterprise, scale | + +## Architecture highlights + +**Kubernetes deployment includes:** +- Horizontal Pod Autoscaler (HPA) for automatic scaling based on CPU, memory, or custom metrics +- Vertical Pod Autoscaler (VPA) for right-sizing pod resource requests +- Multi-region active-active deployments with global load balancing +- Service mesh integration (Istio, Linkerd) for mTLS, traffic management, and observability +- Cloud-native Kafka operators (Strimzi, Confluent) for automated cluster management +- Advanced security policies with Pod Security Standards and Network Policies +- GitOps workflows with ArgoCD or Flux for declarative infrastructure management + +## Enterprise support + +Kubernetes deployments require specialized expertise and planning. 
Our solutions team provides: + +**Architecture & Planning:** +- Custom reference architectures tailored to your requirements +- Capacity planning and sizing recommendations +- Multi-region deployment strategies +- Migration planning from Docker Swarm or other platforms + +**Implementation Support:** +- Helm charts and Kubernetes manifests +- CI/CD pipeline integration +- Service mesh configuration +- Monitoring and observability setup + +**Ongoing Operations:** +- Operational runbooks and best practices +- Scaling and performance optimization +- Disaster recovery planning +- Security hardening and compliance guidance + +## Contact us + +To discuss Kubernetes deployment for your organization: + +**Enterprise Sales & Solutions:** +- Email: enterprise@cometchat.com +- Schedule a consultation: [Contact Form](https://www.cometchat.com/contact) + +**What to prepare for the consultation:** +- Current or projected Monthly Active Users (MAU) +- Peak Concurrent Connections (PCC) estimates +- Geographic distribution requirements +- Compliance and security requirements (GDPR, HIPAA, SOC 2, etc.) +- Existing infrastructure (cloud provider, Kubernetes experience) +- Timeline and deployment goals + +**Response time:** Our solutions team typically responds within 1 business day for enterprise inquiries. + +--- + +## Docker Swarm documentation + +For deployments under 200k MAU, we recommend starting with Docker Swarm for simplicity and lower operational overhead: + +- [Docker Swarm Overview](../docker/overview) - Complete platform overview and architecture +- [Prerequisites](../docker/prerequisites) - Hardware and software requirements +- [Production Deployment](../docker/production-deployment) - Step-by-step deployment guide +- [Scaling](../docker/scaling) - Scaling strategies and Kubernetes migration guidance + +Docker Swarm provides production-grade reliability with significantly lower complexity, making it ideal for most enterprise deployments. diff --git a/images/docker-on-prem-architecture.png b/images/docker-on-prem-architecture.png new file mode 100644 index 00000000..8f168a45 Binary files /dev/null and b/images/docker-on-prem-architecture.png differ diff --git a/images/icons/docker.svg b/images/icons/docker.svg new file mode 100644 index 00000000..eba6cc41 --- /dev/null +++ b/images/icons/docker.svg @@ -0,0 +1,12 @@ + + + + + + + \ No newline at end of file diff --git a/images/icons/kubernetes.svg b/images/icons/kubernetes.svg new file mode 100644 index 00000000..bedd3b88 --- /dev/null +++ b/images/icons/kubernetes.svg @@ -0,0 +1,84 @@ + + + + + + + + + + image/svg+xml + + + + + + + + + + + +