summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--.gitignore1
-rwxr-xr-xbin/clean27
-rwxr-xr-xbin/dev_secrets30
-rwxr-xr-xbin/down4
-rwxr-xr-xbin/prod_secrets35
-rwxr-xr-xbin/test218
-rwxr-xr-xbin/up24
-rw-r--r--docker-compose.yml110
-rw-r--r--searxng/settings.yml (renamed from settings.yml)0
9 files changed, 436 insertions, 13 deletions
diff --git a/.gitignore b/.gitignore
index feead5b..c8745f3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
reference/
+tmp/
diff --git a/bin/clean b/bin/clean
new file mode 100755
index 0000000..a87e386
--- /dev/null
+++ b/bin/clean
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Stop and remove containers, volumes, and orphaned containers.
+# Pass --images to also remove pulled images (forces a fresh re-pull on next start).
+
+REMOVE_IMAGES=false
+for arg in "$@"; do
+  [[ "$arg" == "--images" ]] && REMOVE_IMAGES=true
+done
+
+sudo docker compose down --volumes --remove-orphans
+sudo docker compose rm -f
+
+if [ "$REMOVE_IMAGES" = "true" ]; then
+  echo "Removing cached images..."  # keep the list below in sync with the images in docker-compose.yml
+  sudo docker image rm \
+    ghcr.io/firecrawl/firecrawl:latest \
+    ghcr.io/firecrawl/playwright-service:latest \
+    docker.io/searxng/searxng:latest \
+    ghcr.io/firecrawl/nuq-postgres:latest postgres:17-alpine \
+    rabbitmq:3-management redis:alpine 2>/dev/null || true  # best-effort: images that are not present are ignored
+  echo "Images removed."
+fi
+
+echo "Done. Run bin/up to start fresh."
+
diff --git a/bin/dev_secrets b/bin/dev_secrets
new file mode 100755
index 0000000..ebc9bb0
--- /dev/null
+++ b/bin/dev_secrets
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+export GPG_TTY=$(tty)  # force GPG to use terminal-based pinentry
+
+function ensure_secret() {  # args: PATH DESCRIPTION [GENERATE=false] — create the gopass entry if it is missing
+  local path=$1
+  local description=$2
+  local generate=${3:-false}
+  local val  # declared apart from its assignment so a failing $(openssl …) aborts under set -e
+  if ! gopass show "$path" >/dev/null 2>&1; then
+    echo "Missing secret: $description ($path)"
+    if [ "$generate" = "true" ]; then
+      val=$(openssl rand -hex 32)
+      gopass insert -f "$path" <<< "$val"
+      echo "Generated and stored."
+    else
+      echo "Please enter the value for $description:"
+      read -rs val
+      gopass insert -f "$path" <<< "$val"
+    fi
+  fi
+}
+
+ensure_secret "projects/firecrawl-dokploy/dev/api_key" "Firecrawl API Key" true
+ensure_secret "projects/firecrawl-dokploy/dev/bull_auth_key" "Bull Auth Key" true
+ensure_secret "projects/firecrawl-dokploy/dev/postgres_password" "PostgreSQL Password" true
+ensure_secret "projects/firecrawl-dokploy/dev/openai_api_key" "OpenAI API Key (optional, press enter to skip)" false
+
+echo "Dev secrets ensured."
diff --git a/bin/down b/bin/down
new file mode 100755
index 0000000..38d67fc
--- /dev/null
+++ b/bin/down
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+sudo docker compose down "$@"  # every command-line argument is forwarded unchanged to `docker compose down`
diff --git a/bin/prod_secrets b/bin/prod_secrets
new file mode 100755
index 0000000..ca9f7f1
--- /dev/null
+++ b/bin/prod_secrets
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+export GPG_TTY=$(tty)  # force GPG to use terminal-based pinentry
+
+function get_or_gen_secret() {  # args: PATH [GENERATE=false] DESCRIPTION — prints the secret value on stdout
+  local path=$1
+  local generate=${2:-false}
+  local description=$3
+  local val  # declared apart from its assignment so an openssl failure is not masked by `local`
+  if ! gopass show "$path" >/dev/null 2>&1; then
+    if [ "$generate" = "true" ]; then
+      val=$(openssl rand -hex 32) || return 1  # errexit is off inside $( … ), so fail explicitly
+      gopass insert -f "$path" <<< "$val" >&2  # callers capture stdout: keep it to the secret only
+    else
+      echo >&2 "Missing required secret: $description ($path)"
+      echo >&2 "Please enter the value:"
+      read -rs val
+      gopass insert -f "$path" <<< "$val" >&2
+    fi
+  fi
+  gopass show -o "$path"
+}
+
+TEST_API_KEY=$(get_or_gen_secret "projects/firecrawl-dokploy/prod/api_key" true "Firecrawl API Key")
+BULL_AUTH_KEY=$(get_or_gen_secret "projects/firecrawl-dokploy/prod/bull_auth_key" true "Bull Auth Key")
+POSTGRES_PASSWORD=$(get_or_gen_secret "projects/firecrawl-dokploy/prod/postgres_password" true "PostgreSQL Password")
+OPENAI_API_KEY=$(get_or_gen_secret "projects/firecrawl-dokploy/prod/openai_api_key" false "OpenAI API Key")
+
+cat <<EOF
+TEST_API_KEY=$TEST_API_KEY
+BULL_AUTH_KEY=$BULL_AUTH_KEY
+POSTGRES_PASSWORD=$POSTGRES_PASSWORD
+OPENAI_API_KEY=$OPENAI_API_KEY
+EOF
diff --git a/bin/test b/bin/test
new file mode 100755
index 0000000..c7e9db8
--- /dev/null
+++ b/bin/test
@@ -0,0 +1,218 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Force GPG to use terminal-based pinentry (required for SSH sessions)
+export GPG_TTY=$(tty)
+
+# ----------------------------------------------------------------------------
+# bin/test — verify a running Firecrawl deployment by hitting it from the host
+# (i.e. outside the Docker network), simulating an external client
+# such as an AI agent.
+#
+# Usage:
+# bin/test # local dev — hits http://127.0.0.1:3002
+# bin/test https://my.host # remote — hits the given base URL
+#
+# The API key is read from gopass at projects/firecrawl-dokploy/dev/api_key
+# (override by exporting TEST_API_KEY before running).
+# ----------------------------------------------------------------------------
+
+BASE_URL="${1:-http://127.0.0.1:3002}"
+BASE_URL="${BASE_URL%/}"  # drop one trailing slash, if present
+
+if [ -z "${TEST_API_KEY:-}" ]; then  # a non-empty TEST_API_KEY from the environment takes precedence over gopass
+ TEST_API_KEY="$(gopass show -o projects/firecrawl-dokploy/dev/api_key)"
+fi
+
+if [ -z "${TEST_API_KEY:-}" ]; then
+ echo >&2 "ERROR: TEST_API_KEY is empty (gopass returned nothing and env was unset)"
+ exit 1
+fi
+
+# ----------------------------------------------------------------------------
+# Output capture: tee everything to tmp/test-<timestamp>.log AND the most recent
+# run to tmp/test-latest.log, while still printing to the terminal.
+# ----------------------------------------------------------------------------
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"  # the repository root is the parent of this script's directory
+TMP_DIR="$REPO_ROOT/tmp"
+mkdir -p "$TMP_DIR"
+LOG_FILE="$TMP_DIR/test-$(date +%Y%m%d-%H%M%S).log"
+LATEST_LOG="$TMP_DIR/test-latest.log"
+
+# Redirect all stdout+stderr through tee. Use a process substitution so the
+# script keeps its normal exit code.
+exec > >(tee "$LOG_FILE") 2>&1  # from here on, stdout and stderr both pass through tee
+
+# Mirror to test-latest.log on exit (atomic copy, not a symlink so it survives
+# `rm tmp/*.log`).
+trap 'cp -f "$LOG_FILE" "$LATEST_LOG" 2>/dev/null || true' EXIT  # NOTE(review): tee runs asynchronously, so the copy may miss the last lines — confirm
+
+# ----------------------------------------------------------------------------
+# Pretty output helpers
+# ----------------------------------------------------------------------------
+RED=$'\033[0;31m'  # ANSI escape sequences used to colour the report
+GREEN=$'\033[0;32m'
+YELLOW=$'\033[0;33m'
+BLUE=$'\033[0;34m'
+BOLD=$'\033[1m'
+RESET=$'\033[0m'  # resets all attributes
+
+PASS=0  # number of passed checks; incremented by ok
+FAIL=0  # number of failed checks; incremented by bad
+
+section() {  # $1 = title; prints it as a bold blue "== title ==" heading
+ printf '\n%s== %s ==%s\n' "$BOLD$BLUE" "$1" "$RESET"
+}
+
+ok() {  # $1 = message; prints a green check mark and counts a pass
+ printf ' %s✓%s %s\n' "$GREEN" "$RESET" "$1"
+ PASS=$((PASS + 1))
+}
+
+bad() {  # $1 = message; prints a red cross and counts a failure
+ printf ' %s✗%s %s\n' "$RED" "$RESET" "$1"
+ FAIL=$((FAIL + 1))
+}
+
+info() {  # $1 = message; informational line, changes neither counter
+ printf ' %s…%s %s\n' "$YELLOW" "$RESET" "$1"
+}
+
+# ----------------------------------------------------------------------------
+# Issue a JSON request with the bearer token, capture body + status
+# Args: METHOD PATH [JSON_BODY]
+# Sets: HTTP_STATUS (curl's %{http_code}, or "000" when curl fails), HTTP_BODY
+# ----------------------------------------------------------------------------
+http_call() {
+  local method=$1
+  local path=$2
+  local body=${3:-}
+
+  local tmp
+  tmp=$(mktemp)
+
+  local args=(
+    -sS
+    -o "$tmp"
+    -w '%{http_code}'
+    -X "$method"
+    -H "Authorization: Bearer ${TEST_API_KEY}"
+    -H "Content-Type: application/json"
+    --connect-timeout 10
+    --max-time 120
+  )
+
+  if [ -n "$body" ]; then
+    args+=(-d "$body")
+  fi
+  # curl still prints its own code (000) when it fails, so replace the value instead of appending to it.
+  HTTP_STATUS=$(curl "${args[@]}" "${BASE_URL}${path}") || HTTP_STATUS="000"
+  HTTP_BODY=$(cat "$tmp")
+  rm -f "$tmp"
+}
+
+# ----------------------------------------------------------------------------
+# Test runner
+# ----------------------------------------------------------------------------
+section "Target"
+info "BASE_URL = ${BASE_URL}"
+info "TEST_API_KEY = ${TEST_API_KEY:0:8}…"  # only the first 8 characters of the key are printed
+
+# 1. Reachability ------------------------------------------------------------
+section "1. Reachability"
+http_call GET "/"
+case "$HTTP_STATUS" in
+ 200|404|401)  # any of these statuses shows that a server answered
+ ok "API is reachable (HTTP $HTTP_STATUS at /)"
+ ;;
+ 000)  # the value http_call is meant to report when curl itself fails
+ bad "Could not connect to ${BASE_URL} — is the stack running?"
+ echo
+ echo "Hint: run 'bin/up' first, or pass a different base URL."
+ exit 1
+ ;;
+ *)  # reachable but unexpected: counted as a failure, the run continues
+ bad "Unexpected HTTP $HTTP_STATUS at /"
+ ;;
+esac
+
+# 2. Auth posture (informational) ------------------------------------------
+# Self-hosted Firecrawl with USE_DB_AUTHENTICATION=false has no built-in
+# bearer-token gate — any token (including a bogus one) is accepted by the
+# API. We probe with a bogus token just to surface this fact in the log.
+section "2. Auth posture (informational)"
+tmp=$(mktemp)
+status=$(curl -sS -o "$tmp" -w '%{http_code}' \
+  -X POST "${BASE_URL}/v1/scrape" \
+  -H "Authorization: Bearer fc-definitely-not-a-real-key" \
+  -H "Content-Type: application/json" \
+  -d '{"url":"https://example.com"}' \
+  --connect-timeout 10 --max-time 60) || status="000"  # curl prints its own 000 on failure: replace, do not append
+rm -f "$tmp"
+if [ "$status" = "401" ] || [ "$status" = "403" ]; then
+  ok "bogus key rejected with HTTP $status (USE_DB_AUTHENTICATION on?)"
+else
+  info "bogus key returned HTTP $status — self-hosted Firecrawl is open by design; restrict access at Traefik/firewall level"
+fi
+
+# 3. /v1/scrape --------------------------------------------------------------
+section "3. Scrape — POST /v1/scrape https://example.com"
+http_call POST "/v1/scrape" '{"url":"https://example.com","formats":["markdown"]}'
+if [ "$HTTP_STATUS" = "200" ]; then
+  if printf '%s' "$HTTP_BODY" | grep -qi "example domain"; then
+    ok "scrape returned 200 and markdown contains 'Example Domain'"
+  else
+    bad "scrape returned 200 but markdown did not contain 'Example Domain'"
+    printf '%s' "${HTTP_BODY:0:400}"  # substring instead of `echo | head -c`: no SIGPIPE abort under pipefail
+    echo
+  fi
+else
+  bad "scrape failed with HTTP $HTTP_STATUS"
+  printf '%s' "${HTTP_BODY:0:400}"
+  echo
+fi
+
+# 4. /v1/search (covers SearXNG + Firecrawl scrape pipeline) -----------------
+section "4. Search — POST /v1/search 'firecrawl github'"
+http_call POST "/v1/search" '{"query":"firecrawl github","limit":3}'
+if [ "$HTTP_STATUS" = "200" ]; then
+  if printf '%s' "$HTTP_BODY" | grep -q '"success":true'; then
+    ok "search returned 200 with success:true (SearXNG + scrape pipeline OK)"
+  else
+    bad "search returned 200 but body lacks success:true"
+    printf '%s' "${HTTP_BODY:0:400}"
+    echo
+  fi
+else
+  bad "search failed with HTTP $HTTP_STATUS"
+  printf '%s' "${HTTP_BODY:0:400}"
+  echo
+fi
+
+# 5. /v1/map -----------------------------------------------------------------
+section "5. Map — POST /v1/map https://example.com"
+http_call POST "/v1/map" '{"url":"https://example.com"}'
+if [ "$HTTP_STATUS" = "200" ]; then
+  ok "map returned 200"
+else
+  bad "map failed with HTTP $HTTP_STATUS"
+  printf '%s' "${HTTP_BODY:0:400}"
+  echo
+fi
+
+# ----------------------------------------------------------------------------
+# Summary
+# ----------------------------------------------------------------------------
+echo
+section "Summary"
+printf " %sPassed:%s %d\n" "$GREEN" "$RESET" "$PASS"
+printf " %sFailed:%s %d\n" "$RED" "$RESET" "$FAIL"
+echo
+
+if [ "$FAIL" -gt 0 ]; then  # any failed check makes the script exit with status 1
+ echo "Log: $LOG_FILE"
+ exit 1
+fi
+
+echo "Log: $LOG_FILE"
diff --git a/bin/up b/bin/up
new file mode 100755
index 0000000..1e39076
--- /dev/null
+++ b/bin/up
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Force GPG to use terminal-based pinentry
+export GPG_TTY=$(tty)
+
+# Ensure dokploy-network exists for local dev if not present
+if ! sudo docker network inspect dokploy-network >/dev/null 2>&1; then
+  sudo docker network create dokploy-network
+fi
+
+# Load secrets from gopass. Plain assignments (not `export VAR=$(…)`) so a failing gopass aborts under set -e.
+TEST_API_KEY="$(gopass show -o projects/firecrawl-dokploy/dev/api_key)"
+BULL_AUTH_KEY="$(gopass show -o projects/firecrawl-dokploy/dev/bull_auth_key)"
+POSTGRES_PASSWORD="$(gopass show -o projects/firecrawl-dokploy/dev/postgres_password)"
+OPENAI_API_KEY="$(gopass show -o projects/firecrawl-dokploy/dev/openai_api_key || echo "")"  # optional: empty if absent
+FIRECRAWL_DOMAIN="firecrawl.localhost"
+export TEST_API_KEY BULL_AUTH_KEY POSTGRES_PASSWORD OPENAI_API_KEY FIRECRAWL_DOMAIN
+sudo TEST_API_KEY="$TEST_API_KEY" \
+  BULL_AUTH_KEY="$BULL_AUTH_KEY" \
+  POSTGRES_PASSWORD="$POSTGRES_PASSWORD" \
+  OPENAI_API_KEY="$OPENAI_API_KEY" \
+  FIRECRAWL_DOMAIN="$FIRECRAWL_DOMAIN" \
+  docker compose up "$@"
diff --git a/docker-compose.yml b/docker-compose.yml
index a59d779..44c0f05 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -2,6 +2,65 @@ name: firecrawl
services:
# ============================================================
+ # PostgreSQL — Firecrawl's NUQ queue store
+ # Uses the firecrawl-published image, which extends postgres:17
+ # with pg_cron preloaded and the nuq schema bootstrapped via
+ # /docker-entrypoint-initdb.d/010-nuq.sql.
+ #
+ # NOTE: pg_cron is pinned to database 'postgres' in the image
+ # (cron.database_name = 'postgres'), so POSTGRES_DB MUST be
+ # 'postgres'. The init script creates the nuq schema in that
+ # database. Do not change POSTGRES_DB / POSTGRES_USER here
+ # unless you also rebuild the nuq-postgres image to match.
+ # ============================================================
+ postgres:
+ image: ghcr.io/firecrawl/nuq-postgres:latest
+ networks:
+ - backend
+ environment:
+ POSTGRES_USER: postgres
+ POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-postgres}
+ POSTGRES_DB: postgres
+ volumes:
+ - postgres-data:/var/lib/postgresql/data
+ healthcheck:
+ test: ["CMD-SHELL", "pg_isready -U postgres -d postgres"]
+ interval: 10s
+ timeout: 5s
+ retries: 10
+ start_period: 30s
+ logging:
+ driver: "json-file"
+ options:
+ max-size: "5m"
+ max-file: "2"
+ restart: unless-stopped
+
+ # ============================================================
+ # RabbitMQ — required by Firecrawl's NUQ workers
+ # ============================================================
+ rabbitmq:
+ image: rabbitmq:3-management
+ networks:
+ - backend
+ command: rabbitmq-server
+ volumes:
+ - rabbitmq-data:/var/lib/rabbitmq
+ healthcheck:
+ test: ["CMD", "rabbitmq-diagnostics", "-q", "check_running"]
+ interval: 10s
+ timeout: 5s
+ retries: 10
+ start_period: 30s
+ logging:
+ driver: "json-file"
+ options:
+ max-size: "5m"
+ max-file: "2"
+ compress: "true"
+ restart: unless-stopped
+
+ # ============================================================
# SearXNG — metasearch engine (powers Firecrawl's /search API)
# ============================================================
searxng:
@@ -10,7 +69,7 @@ services:
- backend
- dokploy-network
volumes:
- - ./searxng:/etc/searxng:rw
+ - ./searxng/settings.yml:/etc/searxng/settings.yml:ro
- searxng-cache:/var/cache/searxng:rw
environment:
- SEARXNG_BASE_URL=https://${SEARXNG_DOMAIN:-searxng.localhost}/
@@ -31,7 +90,7 @@ services:
# Otherwise it's only reachable internally by Firecrawl.
# labels:
# - "traefik.enable=true"
- # - "traefik.http.routers.searxng.rule=Host(`${SEARXNG_DOMAIN}`)"
+ # - "traefik.http.routers.searxng.rule=Host(`${SEARXNG_DOMAIN:-searxng.localhost}`)"
# - "traefik.http.routers.searxng.entrypoints=websecure"
# - "traefik.http.routers.searxng.tls.certResolver=letsencrypt"
# - "traefik.http.services.searxng.loadbalancer.server.port=8080"
@@ -69,39 +128,56 @@ services:
# Firecrawl API — scrape, crawl, search, map
# ============================================================
api:
- image: ghcr.io/firecrawl/firecrawl
+ image: ghcr.io/firecrawl/firecrawl:latest
networks:
- backend
- dokploy-network
extra_hosts:
- "host.docker.internal:host-gateway"
environment:
- # === Required ===
- PORT: ${PORT:-3002}
- INTERNAL_PORT: ${INTERNAL_PORT:-3002}
+ # === Server ===
HOST: 0.0.0.0
+ PORT: ${INTERNAL_PORT:-3002}
+ INTERNAL_PORT: ${INTERNAL_PORT:-3002}
+ ENV: local
NUM_WORKERS_PER_QUEUE: ${NUM_WORKERS_PER_QUEUE:-8}
+ USE_DB_AUTHENTICATION: ${USE_DB_AUTHENTICATION:-false}
+ # === Redis ===
REDIS_URL: redis://redis:6379
REDIS_RATE_LIMIT_URL: redis://redis:6379
+ # === Playwright ===
PLAYWRIGHT_MICROSERVICE_URL: http://playwright-service:3000/scrape
- USE_DB_AUTHENTICATION: ${USE_DB_AUTHENTICATION:-false}
- # === SearXNG (internal, same compose network) ===
+ # === SearXNG (internal) ===
SEARXNG_ENDPOINT: http://searxng:8080
SEARXNG_ENGINES: ${SEARXNG_ENGINES:-}
SEARXNG_CATEGORIES: ${SEARXNG_CATEGORIES:-}
- # === Optional: Auth ===
+ # === NUQ Postgres ===
+ # POSTGRES_HOST != "localhost" puts the harness into docker-compose
+ # mode and prevents it from trying to spawn its own container.
+ POSTGRES_HOST: postgres
+ POSTGRES_PORT: "5432"
+ POSTGRES_USER: postgres
+ POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-postgres}
+ POSTGRES_DB: postgres
+ # === NUQ RabbitMQ ===
+ # Must be set explicitly — the harness's docker-compose branch
+ # does NOT auto-populate this, it just skips container management.
+ NUQ_RABBITMQ_URL: amqp://rabbitmq:5672
+ # === Auth / Secrets ===
+ BULL_AUTH_KEY: ${BULL_AUTH_KEY:-}
TEST_API_KEY: ${TEST_API_KEY:-}
- BULL_AUTH_KEY: ${BULL_AUTH_KEY:-CHANGEME}
# === Optional: AI Features ===
OPENAI_API_KEY: ${OPENAI_API_KEY:-}
+ OPENAI_BASE_URL: ${OPENAI_BASE_URL:-}
OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-}
MODEL_NAME: ${MODEL_NAME:-}
+ MODEL_EMBEDDING_NAME: ${MODEL_EMBEDDING_NAME:-}
# === Optional: Proxy ===
PROXY_SERVER: ${PROXY_SERVER:-}
PROXY_USERNAME: ${PROXY_USERNAME:-}
PROXY_PASSWORD: ${PROXY_PASSWORD:-}
ports:
- - ${PORT:-3002}
+ - "127.0.0.1:${INTERNAL_PORT:-3002}:${INTERNAL_PORT:-3002}"
ulimits:
nofile:
soft: 65535
@@ -109,10 +185,16 @@ services:
cpus: 4.0
mem_limit: 8G
memswap_limit: 8G
+ # --start-docker = run pre-built dist/ directly, skip pnpm install
+ # and skip the container-management code paths in harness.ts.
command: node dist/src/harness.js --start-docker
depends_on:
redis:
condition: service_healthy
+ postgres:
+ condition: service_healthy
+ rabbitmq:
+ condition: service_healthy
playwright-service:
condition: service_started
searxng:
@@ -122,7 +204,7 @@ services:
- "traefik.http.routers.firecrawl-api.rule=Host(`${FIRECRAWL_DOMAIN}`)"
- "traefik.http.routers.firecrawl-api.entrypoints=websecure"
- "traefik.http.routers.firecrawl-api.tls.certResolver=letsencrypt"
- - "traefik.http.services.firecrawl-api.loadbalancer.server.port=${PORT:-3002}"
+ - "traefik.http.services.firecrawl-api.loadbalancer.server.port=${INTERNAL_PORT:-3002}"
logging:
driver: "json-file"
options:
@@ -132,7 +214,7 @@ services:
restart: unless-stopped
# ============================================================
- # Redis — queues, rate limiting, caching
+ # Redis — rate limiting, cache
# ============================================================
redis:
image: redis:alpine
@@ -161,5 +243,7 @@ networks:
external: true
volumes:
+ postgres-data:
+ rabbitmq-data:
redis-data:
searxng-cache:
diff --git a/settings.yml b/searxng/settings.yml
index ac380be..ac380be 100644
--- a/settings.yml
+++ b/searxng/settings.yml