diff options
| author | Adam Malczewski <[email protected]> | 2026-04-27 23:54:19 +0900 |
|---|---|---|
| committer | Adam Malczewski <[email protected]> | 2026-04-27 23:54:19 +0900 |
| commit | 60680e0419f96a628f9eccaf9c53d6749d0a20ca (patch) | |
| tree | 55114197ec14bc419f88e1383d29294952960261 | |
| parent | c7d5395ddc4f818d1faf0c59bd7c87d4ffd67a12 (diff) | |
| download | firecrawl-dokploy-60680e0419f96a628f9eccaf9c53d6749d0a20ca.tar.gz firecrawl-dokploy-60680e0419f96a628f9eccaf9c53d6749d0a20ca.zip | |
working local deploy
| -rw-r--r-- | .gitignore | 1 | ||||
| -rwxr-xr-x | bin/clean | 27 | ||||
| -rwxr-xr-x | bin/dev_secrets | 30 | ||||
| -rwxr-xr-x | bin/down | 4 | ||||
| -rwxr-xr-x | bin/prod_secrets | 35 | ||||
| -rwxr-xr-x | bin/test | 218 | ||||
| -rwxr-xr-x | bin/up | 24 | ||||
| -rw-r--r-- | docker-compose.yml | 110 | ||||
| -rw-r--r-- | searxng/settings.yml (renamed from settings.yml) | 0 |
9 files changed, 436 insertions, 13 deletions
@@ -1 +1,2 @@ reference/ +tmp/ diff --git a/bin/clean b/bin/clean new file mode 100755 index 0000000..a87e386 --- /dev/null +++ b/bin/clean @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Stop and remove containers, volumes, and orphaned containers. +# Pass --images to also remove pulled images (forces a fresh re-pull on next start). + +REMOVE_IMAGES=false +for arg in "$@"; do + [[ "$arg" == "--images" ]] && REMOVE_IMAGES=true +done + +sudo docker compose down --volumes --remove-orphans +sudo docker compose rm -f + +if [ "$REMOVE_IMAGES" = "true" ]; then + echo "Removing cached images..." + sudo docker image rm \ + ghcr.io/firecrawl/firecrawl:latest \ + ghcr.io/firecrawl/playwright-service:latest \ + docker.io/searxng/searxng:latest \ + postgres:17-alpine \ + redis:alpine 2>/dev/null || true + echo "Images removed." +fi + +echo "Done. Run bin/up to start fresh." + diff --git a/bin/dev_secrets b/bin/dev_secrets new file mode 100755 index 0000000..ebc9bb0 --- /dev/null +++ b/bin/dev_secrets @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +set -euo pipefail + +export GPG_TTY=$(tty) + +function ensure_secret() { + local path=$1 + local description=$2 + local generate=${3:-false} + + if ! gopass show "$path" >/dev/null 2>&1; then + echo "Missing secret: $description ($path)" + if [ "$generate" = "true" ]; then + local val=$(openssl rand -hex 32) + gopass insert -f "$path" <<< "$val" + echo "Generated and stored." + else + echo "Please enter the value for $description:" + read -rs val + gopass insert -f "$path" <<< "$val" + fi + fi +} + +ensure_secret "projects/firecrawl-dokploy/dev/api_key" "Firecrawl API Key" true +ensure_secret "projects/firecrawl-dokploy/dev/bull_auth_key" "Bull Auth Key" true +ensure_secret "projects/firecrawl-dokploy/dev/postgres_password" "PostgreSQL Password" true +ensure_secret "projects/firecrawl-dokploy/dev/openai_api_key" "OpenAI API Key (optional, press enter to skip)" false + +echo "Dev secrets ensured." diff --git a/bin/down b/bin/down new file mode 100755 index 0000000..38d67fc --- /dev/null +++ b/bin/down @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +set -euo pipefail + +sudo docker compose down "$@" diff --git a/bin/prod_secrets b/bin/prod_secrets new file mode 100755 index 0000000..ca9f7f1 --- /dev/null +++ b/bin/prod_secrets @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +set -euo pipefail + +export GPG_TTY=$(tty) + +function get_or_gen_secret() { + local path=$1 + local generate=${2:-false} + local description=$3 + + if ! gopass show "$path" >/dev/null 2>&1; then + if [ "$generate" = "true" ]; then + local val=$(openssl rand -hex 32) + gopass insert -f "$path" <<< "$val" + else + echo >&2 "Missing required secret: $description ($path)" + echo >&2 "Please enter the value:" + read -rs val + gopass insert -f "$path" <<< "$val" + fi + fi + gopass show -o "$path" +} + +TEST_API_KEY=$(get_or_gen_secret "projects/firecrawl-dokploy/prod/api_key" true "Firecrawl API Key") +BULL_AUTH_KEY=$(get_or_gen_secret "projects/firecrawl-dokploy/prod/bull_auth_key" true "Bull Auth Key") +POSTGRES_PASSWORD=$(get_or_gen_secret "projects/firecrawl-dokploy/prod/postgres_password" true "PostgreSQL Password") +OPENAI_API_KEY=$(get_or_gen_secret "projects/firecrawl-dokploy/prod/openai_api_key" false "OpenAI API Key") + +cat <<EOF +TEST_API_KEY=$TEST_API_KEY +BULL_AUTH_KEY=$BULL_AUTH_KEY +POSTGRES_PASSWORD=$POSTGRES_PASSWORD +OPENAI_API_KEY=$OPENAI_API_KEY +EOF diff --git a/bin/test b/bin/test new file mode 100755 index 0000000..c7e9db8 --- /dev/null +++ b/bin/test @@ -0,0 +1,218 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Force GPG to use terminal-based pinentry (required for SSH sessions) +export GPG_TTY=$(tty) + +# ---------------------------------------------------------------------------- +# bin/test — verify a running Firecrawl deployment by hitting it from the host +# (i.e. outside the Docker network), simulating an external client +# such as an AI agent. +# +# Usage: +# bin/test # local dev — hits http://127.0.0.1:3002 +# bin/test https://my.host # remote — hits the given base URL +# +# The API key is read from gopass at projects/firecrawl-dokploy/dev/api_key +# (override by exporting TEST_API_KEY before running). +# ---------------------------------------------------------------------------- + +BASE_URL="${1:-http://127.0.0.1:3002}" +BASE_URL="${BASE_URL%/}" + +if [ -z "${TEST_API_KEY:-}" ]; then + TEST_API_KEY="$(gopass show -o projects/firecrawl-dokploy/dev/api_key)" +fi + +if [ -z "${TEST_API_KEY:-}" ]; then + echo >&2 "ERROR: TEST_API_KEY is empty (gopass returned nothing and env was unset)" + exit 1 +fi + +# ---------------------------------------------------------------------------- +# Output capture: tee everything to tmp/test-<timestamp>.log AND the most recent +# run to tmp/test-latest.log, while still printing to the terminal. +# ---------------------------------------------------------------------------- +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +TMP_DIR="$REPO_ROOT/tmp" +mkdir -p "$TMP_DIR" +LOG_FILE="$TMP_DIR/test-$(date +%Y%m%d-%H%M%S).log" +LATEST_LOG="$TMP_DIR/test-latest.log" + +# Redirect all stdout+stderr through tee. Use a process substitution so the +# script keeps its normal exit code. +exec > >(tee "$LOG_FILE") 2>&1 + +# Mirror to test-latest.log on exit (atomic copy, not a symlink so it survives +# `rm tmp/*.log`). +trap 'cp -f "$LOG_FILE" "$LATEST_LOG" 2>/dev/null || true' EXIT + +# ---------------------------------------------------------------------------- +# Pretty output helpers +# ---------------------------------------------------------------------------- +RED=$'\033[0;31m' +GREEN=$'\033[0;32m' +YELLOW=$'\033[0;33m' +BLUE=$'\033[0;34m' +BOLD=$'\033[1m' +RESET=$'\033[0m' + +PASS=0 +FAIL=0 + +section() { + printf '\n%s== %s ==%s\n' "$BOLD$BLUE" "$1" "$RESET" +} + +ok() { + printf ' %s✓%s %s\n' "$GREEN" "$RESET" "$1" + PASS=$((PASS + 1)) +} + +bad() { + printf ' %s✗%s %s\n' "$RED" "$RESET" "$1" + FAIL=$((FAIL + 1)) +} + +info() { + printf ' %s…%s %s\n' "$YELLOW" "$RESET" "$1" +} + +# ---------------------------------------------------------------------------- +# Issue a JSON request, capture body + status +# Args: METHOD PATH [JSON_BODY] +# Sets: HTTP_STATUS, HTTP_BODY +# ---------------------------------------------------------------------------- +http_call() { + local method=$1 + local path=$2 + local body=${3:-} + + local tmp + tmp=$(mktemp) + + local args=( + -sS + -o "$tmp" + -w '%{http_code}' + -X "$method" + -H "Authorization: Bearer ${TEST_API_KEY}" + -H "Content-Type: application/json" + --connect-timeout 10 + --max-time 120 + ) + + if [ -n "$body" ]; then + args+=(-d "$body") + fi + + HTTP_STATUS=$(curl "${args[@]}" "${BASE_URL}${path}" || echo "000") + HTTP_BODY=$(cat "$tmp") + rm -f "$tmp" +} + +# ---------------------------------------------------------------------------- +# Test runner +# ---------------------------------------------------------------------------- +section "Target" +info "BASE_URL = ${BASE_URL}" +info "TEST_API_KEY = ${TEST_API_KEY:0:8}…" + +# 1. Reachability ------------------------------------------------------------ +section "1. Reachability" +http_call GET "/" +case "$HTTP_STATUS" in + 200|404|401) + ok "API is reachable (HTTP $HTTP_STATUS at /)" + ;; + 000) + bad "Could not connect to ${BASE_URL} — is the stack running?" + echo + echo "Hint: run 'bin/up' first, or pass a different base URL." + exit 1 + ;; + *) + bad "Unexpected HTTP $HTTP_STATUS at /" + ;; +esac + +# 2. Auth posture (informational) ------------------------------------------ +# Self-hosted Firecrawl with USE_DB_AUTHENTICATION=false has no built-in +# bearer-token gate — any token (including a bogus one) is accepted by the +# API. We probe with a bogus token just to surface this fact in the log. +section "2. Auth posture (informational)" +tmp=$(mktemp) +status=$(curl -sS -o "$tmp" -w '%{http_code}' \ + -X POST "${BASE_URL}/v1/scrape" \ + -H "Authorization: Bearer fc-definitely-not-a-real-key" \ + -H "Content-Type: application/json" \ + -d '{"url":"https://example.com"}' \ + --connect-timeout 10 --max-time 60 || echo "000") +rm -f "$tmp" +if [ "$status" = "401" ] || [ "$status" = "403" ]; then + ok "bogus key rejected with HTTP $status (USE_DB_AUTHENTICATION on?)" +else + info "bogus key returned HTTP $status — self-hosted Firecrawl is open by design; restrict access at Traefik/firewall level" +fi + +# 3. /v1/scrape -------------------------------------------------------------- +section "3. Scrape — POST /v1/scrape https://example.com" +http_call POST "/v1/scrape" '{"url":"https://example.com","formats":["markdown"]}' +if [ "$HTTP_STATUS" = "200" ]; then + if printf '%s' "$HTTP_BODY" | grep -qi "example domain"; then + ok "scrape returned 200 and markdown contains 'Example Domain'" + else + bad "scrape returned 200 but markdown did not contain 'Example Domain'" + echo "$HTTP_BODY" | head -c 400 + echo + fi +else + bad "scrape failed with HTTP $HTTP_STATUS" + echo "$HTTP_BODY" | head -c 400 + echo +fi + +# 4. /v1/search (covers SearXNG + Firecrawl scrape pipeline) ----------------- +section "4. Search — POST /v1/search 'firecrawl github'" +http_call POST "/v1/search" '{"query":"firecrawl github","limit":3}' +if [ "$HTTP_STATUS" = "200" ]; then + if printf '%s' "$HTTP_BODY" | grep -q '"success":true'; then + ok "search returned 200 with success:true (SearXNG + scrape pipeline OK)" + else + bad "search returned 200 but body lacks success:true" + echo "$HTTP_BODY" | head -c 400 + echo + fi +else + bad "search failed with HTTP $HTTP_STATUS" + echo "$HTTP_BODY" | head -c 400 + echo +fi + +# 5. /v1/map ----------------------------------------------------------------- +section "5. Map — POST /v1/map https://example.com" +http_call POST "/v1/map" '{"url":"https://example.com"}' +if [ "$HTTP_STATUS" = "200" ]; then + ok "map returned 200" +else + bad "map failed with HTTP $HTTP_STATUS" + echo "$HTTP_BODY" | head -c 400 + echo +fi + +# ---------------------------------------------------------------------------- +# Summary +# ---------------------------------------------------------------------------- +echo +section "Summary" +printf " %sPassed:%s %d\n" "$GREEN" "$RESET" "$PASS" +printf " %sFailed:%s %d\n" "$RED" "$RESET" "$FAIL" +echo + +if [ "$FAIL" -gt 0 ]; then + echo "Log: $LOG_FILE" + exit 1 +fi + +echo "Log: $LOG_FILE" @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Force GPG to use terminal-based pinentry +export GPG_TTY=$(tty) + +# Ensure dokploy-network exists for local dev if not present +if ! sudo docker network inspect dokploy-network >/dev/null 2>&1; then + sudo docker network create dokploy-network +fi + +# Load secrets from gopass +export TEST_API_KEY="$(gopass show -o projects/firecrawl-dokploy/dev/api_key)" +export BULL_AUTH_KEY="$(gopass show -o projects/firecrawl-dokploy/dev/bull_auth_key)" +export POSTGRES_PASSWORD="$(gopass show -o projects/firecrawl-dokploy/dev/postgres_password)" +export OPENAI_API_KEY="$(gopass show -o projects/firecrawl-dokploy/dev/openai_api_key || echo "")" +export FIRECRAWL_DOMAIN="firecrawl.localhost" + +sudo TEST_API_KEY="$TEST_API_KEY" \ + BULL_AUTH_KEY="$BULL_AUTH_KEY" \ + POSTGRES_PASSWORD="$POSTGRES_PASSWORD" \ + OPENAI_API_KEY="$OPENAI_API_KEY" \ + FIRECRAWL_DOMAIN="$FIRECRAWL_DOMAIN" \ + docker compose up "$@" diff --git a/docker-compose.yml b/docker-compose.yml index a59d779..44c0f05 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,6 +2,65 @@ name: firecrawl services: # ============================================================ + # PostgreSQL — Firecrawl's NUQ queue store + # Uses the firecrawl-published image, which extends postgres:17 + # with pg_cron preloaded and the nuq schema bootstrapped via + # /docker-entrypoint-initdb.d/010-nuq.sql. + # + # NOTE: pg_cron is pinned to database 'postgres' in the image + # (cron.database_name = 'postgres'), so POSTGRES_DB MUST be + # 'postgres'. The init script creates the nuq schema in that + # database. Do not change POSTGRES_DB / POSTGRES_USER here + # unless you also rebuild the nuq-postgres image to match. + # ============================================================ + postgres: + image: ghcr.io/firecrawl/nuq-postgres:latest + networks: + - backend + environment: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-postgres} + POSTGRES_DB: postgres + volumes: + - postgres-data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres -d postgres"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 30s + logging: + driver: "json-file" + options: + max-size: "5m" + max-file: "2" + restart: unless-stopped + + # ============================================================ + # RabbitMQ — required by Firecrawl's NUQ workers + # ============================================================ + rabbitmq: + image: rabbitmq:3-management + networks: + - backend + command: rabbitmq-server + volumes: + - rabbitmq-data:/var/lib/rabbitmq + healthcheck: + test: ["CMD", "rabbitmq-diagnostics", "-q", "check_running"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 30s + logging: + driver: "json-file" + options: + max-size: "5m" + max-file: "2" + compress: "true" + restart: unless-stopped + + # ============================================================ # SearXNG — metasearch engine (powers Firecrawl's /search API) # ============================================================ searxng: @@ -10,7 +69,7 @@ services: - backend - dokploy-network volumes: - - ./searxng:/etc/searxng:rw + - ./searxng/settings.yml:/etc/searxng/settings.yml:ro - searxng-cache:/var/cache/searxng:rw environment: - SEARXNG_BASE_URL=https://${SEARXNG_DOMAIN:-searxng.localhost}/ @@ -31,7 +90,7 @@ services: # Otherwise it's only reachable internally by Firecrawl. # labels: # - "traefik.enable=true" - # - "traefik.http.routers.searxng.rule=Host(`${SEARXNG_DOMAIN}`)" + # - "traefik.http.routers.searxng.rule=Host(`${SEARXNG_DOMAIN:-searxng.localhost}`)" # - "traefik.http.routers.searxng.entrypoints=websecure" # - "traefik.http.routers.searxng.tls.certResolver=letsencrypt" # - "traefik.http.services.searxng.loadbalancer.server.port=8080" @@ -69,39 +128,56 @@ services: # Firecrawl API — scrape, crawl, search, map # ============================================================ api: - image: ghcr.io/firecrawl/firecrawl + image: ghcr.io/firecrawl/firecrawl:latest networks: - backend - dokploy-network extra_hosts: - "host.docker.internal:host-gateway" environment: - # === Required === - PORT: ${PORT:-3002} - INTERNAL_PORT: ${INTERNAL_PORT:-3002} + # === Server === HOST: 0.0.0.0 + PORT: ${INTERNAL_PORT:-3002} + INTERNAL_PORT: ${INTERNAL_PORT:-3002} + ENV: local NUM_WORKERS_PER_QUEUE: ${NUM_WORKERS_PER_QUEUE:-8} + USE_DB_AUTHENTICATION: ${USE_DB_AUTHENTICATION:-false} + # === Redis === REDIS_URL: redis://redis:6379 REDIS_RATE_LIMIT_URL: redis://redis:6379 + # === Playwright === PLAYWRIGHT_MICROSERVICE_URL: http://playwright-service:3000/scrape - USE_DB_AUTHENTICATION: ${USE_DB_AUTHENTICATION:-false} - # === SearXNG (internal, same compose network) === + # === SearXNG (internal) === SEARXNG_ENDPOINT: http://searxng:8080 SEARXNG_ENGINES: ${SEARXNG_ENGINES:-} SEARXNG_CATEGORIES: ${SEARXNG_CATEGORIES:-} - # === Optional: Auth === + # === NUQ Postgres === + # POSTGRES_HOST != "localhost" puts the harness into docker-compose + # mode and prevents it from trying to spawn its own container. + POSTGRES_HOST: postgres + POSTGRES_PORT: "5432" + POSTGRES_USER: postgres + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-postgres} + POSTGRES_DB: postgres + # === NUQ RabbitMQ === + # Must be set explicitly — the harness's docker-compose branch + # does NOT auto-populate this, it just skips container management. + NUQ_RABBITMQ_URL: amqp://rabbitmq:5672 + # === Auth / Secrets === + BULL_AUTH_KEY: ${BULL_AUTH_KEY:-} TEST_API_KEY: ${TEST_API_KEY:-} - BULL_AUTH_KEY: ${BULL_AUTH_KEY:-CHANGEME} # === Optional: AI Features === OPENAI_API_KEY: ${OPENAI_API_KEY:-} + OPENAI_BASE_URL: ${OPENAI_BASE_URL:-} OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-} MODEL_NAME: ${MODEL_NAME:-} + MODEL_EMBEDDING_NAME: ${MODEL_EMBEDDING_NAME:-} # === Optional: Proxy === PROXY_SERVER: ${PROXY_SERVER:-} PROXY_USERNAME: ${PROXY_USERNAME:-} PROXY_PASSWORD: ${PROXY_PASSWORD:-} ports: - - ${PORT:-3002} + - "127.0.0.1:${INTERNAL_PORT:-3002}:${INTERNAL_PORT:-3002}" ulimits: nofile: soft: 65535 @@ -109,10 +185,16 @@ services: cpus: 4.0 mem_limit: 8G memswap_limit: 8G + # --start-docker = run pre-built dist/ directly, skip pnpm install + # and skip the container-management code paths in harness.ts. command: node dist/src/harness.js --start-docker depends_on: redis: condition: service_healthy + postgres: + condition: service_healthy + rabbitmq: + condition: service_healthy playwright-service: condition: service_started searxng: @@ -122,7 +204,7 @@ services: - "traefik.http.routers.firecrawl-api.rule=Host(`${FIRECRAWL_DOMAIN}`)" - "traefik.http.routers.firecrawl-api.entrypoints=websecure" - "traefik.http.routers.firecrawl-api.tls.certResolver=letsencrypt" - - "traefik.http.services.firecrawl-api.loadbalancer.server.port=${PORT:-3002}" + - "traefik.http.services.firecrawl-api.loadbalancer.server.port=${INTERNAL_PORT:-3002}" logging: driver: "json-file" options: @@ -132,7 +214,7 @@ services: restart: unless-stopped # ============================================================ - # Redis — queues, rate limiting, caching + # Redis — rate limiting, cache # ============================================================ redis: image: redis:alpine @@ -161,5 +243,7 @@ networks: external: true volumes: + postgres-data: + rabbitmq-data: redis-data: searxng-cache: diff --git a/settings.yml b/searxng/settings.yml index ac380be..ac380be 100644 --- a/settings.yml +++ b/searxng/settings.yml |
