Remove everything that's not needed for PLC mirror
parent 56727bbe11
commit 9b877c1524
Makefile (72 changed lines)
|
@@ -1,6 +1,4 @@
|
||||||
.PHONY: all build up update down start-db status logs psql init-db start-plc wait-for-plc
|
.PHONY: all build up update down start-db status logs
|
||||||
|
|
||||||
# ---------------------------- Docker ----------------------------
|
|
||||||
|
|
||||||
all:
|
all:
|
||||||
go test -v ./...
|
go test -v ./...
|
||||||
|
@@ -29,71 +27,3 @@ status:
|
||||||
|
|
||||||
logs:
|
logs:
|
||||||
@docker compose logs -f -n 50
|
@docker compose logs -f -n 50
|
||||||
|
|
||||||
start-plc: .env
|
|
||||||
@docker compose up -d --build postgres plc
|
|
||||||
|
|
||||||
wait-for-plc:
|
|
||||||
@. ./.env && while ! curl -s --fail-with-body http://$${METRICS_ADDR:-localhost}:11004/ready; do sleep 10; done
|
|
||||||
|
|
||||||
# ---------------------------- Docker ----------------------------
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------- Database ----------------------------
|
|
||||||
|
|
||||||
psql:
|
|
||||||
@docker compose up -d postgres
|
|
||||||
@docker compose exec -it postgres psql -U postgres -d bluesky
|
|
||||||
|
|
||||||
init-db: .env
|
|
||||||
@docker compose up -d --build lister
|
|
||||||
@sleep 10
|
|
||||||
@docker compose stop lister
|
|
||||||
@cat ./db-migration/init.sql | docker exec -i "$$(docker compose ps --format '{{.Names}}' postgres)" psql -U postgres -d bluesky
|
|
||||||
|
|
||||||
sqltop:
|
|
||||||
watch -n 1 'cat top.sql|docker compose exec -i postgres psql -U postgres -d bluesky'
|
|
||||||
|
|
||||||
sqldu:
|
|
||||||
cat du.sql | docker compose exec -iT postgres psql -U postgres -d bluesky
|
|
||||||
|
|
||||||
# ---------------------------- Database ----------------------------
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------- CSV Export ----------------------------
|
|
||||||
|
|
||||||
# NOT RECOMMENDED TO RUN for the first time on a hot live DB: it will consume all available IO. Stop the other services first.
|
|
||||||
csv-export:
|
|
||||||
@docker compose up -d postgres
|
|
||||||
@sleep 10
|
|
||||||
@nohup ./csv_export.sh > csv_export.out &
|
|
||||||
|
|
||||||
csv-iexport:
|
|
||||||
@docker compose up -d postgres
|
|
||||||
@sleep 10
|
|
||||||
@nohup ./csv_iexport.sh > csv_iexport.out &
|
|
||||||
|
|
||||||
csv-iexport-month:
|
|
||||||
@docker compose up -d postgres
|
|
||||||
@sleep 10
|
|
||||||
@nohup ./csv_iexport_month.sh > csv_iexport_month.out &
|
|
||||||
|
|
||||||
kill-csv-export:
|
|
||||||
@kill -9 `pgrep csv_export.sh`
|
|
||||||
|
|
||||||
kill-csv-iexport:
|
|
||||||
@kill -9 `pgrep csv_iexport.sh`
|
|
||||||
|
|
||||||
kill-csv-iexport-month:
|
|
||||||
@kill -9 `pgrep csv_iexport_month.sh`
|
|
||||||
|
|
||||||
# ---------------------------- CSV Export ----------------------------
|
|
||||||
|
|
||||||
|
|
||||||
dash-export:
|
|
||||||
@./dashboards/export.sh
|
|
||||||
|
|
||||||
dash-import:
|
|
||||||
@./dashboards/update.sh
|
|
||||||
|
|
README.md (106 changed lines)
|
@@ -1,52 +1,7 @@
|
||||||
# Bluesky indexer
|
# PLC mirror
|
||||||
|
|
||||||
This is a bunch of code that can download all of Bluesky into a giant table in
|
Syncs the PLC operations log into a local table and allows resolving `did:plc:`
|
||||||
PostgreSQL.
|
DIDs without putting strain on https://plc.directory and hitting rate limits.
|
||||||
|
|
||||||
The structure of that table is roughly `(repo, collection, rkey) -> JSON`, and
|
|
||||||
it is a good idea to partition it by collection.
|
|
||||||
|
|
||||||
## System requirements
|
|
||||||
|
|
||||||
NOTE: all of this is valid as of April 2024, when Bluesky has ~5.5M accounts,
|
|
||||||
~1.2B records total, and average daily peak of ~100 commits/s.
|
|
||||||
|
|
||||||
* One decent SATA SSD is plenty fast to keep up. Preferably a dedicated one
|
|
||||||
(definitely not the same that your system is installed on). There will be a
|
|
||||||
lot of writes happening, so the total durability of the disk will be used up
|
|
||||||
at non-negligible rate.
|
|
||||||
* 16GB of RAM, but the more the better, obviously.
|
|
||||||
* ZFS with compression enabled is highly recommended, but not strictly
|
|
||||||
necessary.
|
|
||||||
* Compression will cut down on IO bandwidth quite a bit, as well as on used
|
|
||||||
disk space. On a compressed FS the whole database takes up about 270GB,
|
|
||||||
without compression - almost 3 times as much.
|
|
||||||
|
|
||||||
## Overview of components
|
|
||||||
|
|
||||||
### Lister
|
|
||||||
|
|
||||||
Once a day, gets a list of all repos from all known PDSs and adds any that are
|
|
||||||
missing to the database.
|
|
||||||
|
|
||||||
### Consumer
|
|
||||||
|
|
||||||
Connects to firehose of each PDS and stores all received records in the
|
|
||||||
database.
|
|
||||||
|
|
||||||
If `CONSUMER_RELAYS` is specified, it will also add any new PDSs to the database
|
|
||||||
that have records sent through a relay.
|
|
||||||
|
|
||||||
### Record indexer
|
|
||||||
|
|
||||||
Goes over all repos that might have missing data, gets a full checkout from the
|
|
||||||
PDS and adds all missing records to the database.
|
|
||||||
|
|
||||||
### PLC mirror
|
|
||||||
|
|
||||||
Syncs PLC operations log into a local table, and allows other components to
|
|
||||||
resolve `did:plc:` DIDs without putting strain on https://plc.directory and
|
|
||||||
hitting rate limits.
|
|
||||||
|
|
||||||
## Setup
|
## Setup
|
||||||
|
|
||||||
|
@@ -54,57 +9,12 @@ hitting rate limits.
|
||||||
* Copy `example.env` to `.env` and edit it to your liking.
|
* Copy `example.env` to `.env` and edit it to your liking.
|
||||||
* `POSTGRES_PASSWORD` can be anything, it will be used on the first start of
|
* `POSTGRES_PASSWORD` can be anything, it will be used on the first start of
|
||||||
`postgres` container to initialize the database.
|
`postgres` container to initialize the database.
|
||||||
* Optional: copy `docker-compose.override.yml.example` to
|
|
||||||
`docker-compose.override.yml` to change some parts of `docker-compose.yml`
|
|
||||||
without actually editing it (and introducing possibility of merge conflicts
|
|
||||||
later on).
|
|
||||||
* `make start-plc`
|
|
||||||
* This will start PostgreSQL and PLC mirror
|
|
||||||
* `make wait-for-plc`
|
|
||||||
* This will wait until PLC mirror has fully replicated the operations log.
|
|
||||||
That's gonna take a few hours.
|
|
||||||
* Technically you can start everything before it is caught up: it will
|
|
||||||
return errors and other components will fall back to querying
|
|
||||||
https://plc.directory. But you will be rate-limited quite hard.
|
|
||||||
* `make init-db`
|
|
||||||
* This will add the initial set of PDS hosts into the database.
|
|
||||||
* You can skip this if you're specifying `CONSUMER_RELAYS` in
|
|
||||||
`docker-compose.override.yml`
|
|
||||||
* `make up`
|
* `make up`
|
||||||
|
|
||||||
## Additional commands
|
## Usage
|
||||||
|
|
||||||
* `make status` - will show container status and resource usage
|
You can directly replace `https://plc.directory` with the URL of the exposed port
|
||||||
* `make psql` - starts up SQL shell inside the `postgres` container
|
(11004 by default).
|
||||||
* `make logs` - streams container logs into your terminal
|
|
||||||
* `make sqltop` - will show you currently running queries
|
|
||||||
* `make sqldu` - will show disk space usage for each table and index
|
|
||||||
|
|
||||||
## Tweaking the number of indexer threads at runtime
|
Note that on the first run it will take quite a few hours to download everything,
|
||||||
|
and the mirror will respond with 500 if it's not caught up yet.
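For example, a minimal sketch assuming the mirror is exposed on `localhost:11004` (the default in this setup), with a placeholder DID:

```sh
# Check that the mirror has caught up (same endpoint polled by `make wait-for-plc`).
curl -s --fail-with-body http://localhost:11004/ready

# Resolve a DID document locally instead of querying https://plc.directory.
# The DID below is a placeholder; substitute a real did:plc value.
curl -s http://localhost:11004/did:plc:REPLACE_WITH_A_REAL_DID
```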
|
||||||
The record indexer exposes a simple HTTP handler that allows you to do this:
|
|
||||||
|
|
||||||
`curl -s 'http://localhost:11003/pool/resize?size=10'`
|
|
||||||
|
|
||||||
## Advanced topics
|
|
||||||
|
|
||||||
### Table partitioning
|
|
||||||
|
|
||||||
With partitioning by collection you can have separate indexes for each record
|
|
||||||
type. Also, doing any kind of heavy processing on a particular record type will
|
|
||||||
also be faster, because all of these records will be in a separate table and
|
|
||||||
PostgreSQL will just read them sequentially, instead of checking `collection`
|
|
||||||
column for each row.
|
|
||||||
|
|
||||||
You can do the partitioning at any point, but the more data you already have in
|
|
||||||
the database, the longer it will take.
|
|
||||||
|
|
||||||
Before doing this you need to run `lister` at least once in order to create the
|
|
||||||
tables (`make init-db` does this for you as well).
|
|
||||||
|
|
||||||
* Stop all containers except for `postgres`.
|
|
||||||
* Run the [SQL script](db-migration/migrations/20240217_partition.sql) in
|
|
||||||
`psql` (see the sketch after this list).
|
|
||||||
* Check [`migrations`](db-migration/migrations/) dir for any additional
|
|
||||||
migrations you might be interested in.
|
|
||||||
* Once all is done, start the other containers again.
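For reference, a sketch of running the partitioning migration from the host, following the same `docker compose exec ... psql` pattern the other targets in this repo use (flags may need adjusting for your setup):

```sh
# Make sure postgres is up; stop the other containers first, since the
# migration is long-running on a large database.
docker compose up -d postgres

# Pipe the migration into psql inside the postgres container.
cat db-migration/migrations/20240217_partition.sql \
  | docker compose exec -iT postgres psql -U postgres -d bluesky --echo-queries
```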
|
|
||||||
|
|
|
@@ -1 +0,0 @@
|
||||||
../.env
|
|
|
@@ -1,6 +0,0 @@
|
||||||
#!/bin/sh
|
|
||||||
|
|
||||||
cd ..
|
|
||||||
|
|
||||||
docker compose exec -i postgres pg_dump -U postgres -d bluesky -t records -t records_id_seq --schema-only | sed -E -e 's/PARTITION BY.*/;/' > records.sql
|
|
||||||
docker compose exec -i postgres pg_dump -U postgres -d bluesky --table-and-children records --load-via-partition-root --data-only | lz4 > records.sql.lz4
|
|
|
@@ -1,14 +0,0 @@
|
||||||
version: '3.8'
|
|
||||||
|
|
||||||
services:
|
|
||||||
postgres:
|
|
||||||
image: "postgres:16"
|
|
||||||
volumes:
|
|
||||||
- "${DATA_DIR:?specify data dir in .env file}/benchmark:/var/lib/postgresql/data:rw"
|
|
||||||
restart: always
|
|
||||||
extra_hosts:
|
|
||||||
- "host.docker.internal:host-gateway"
|
|
||||||
environment:
|
|
||||||
POSTGRES_DB: bluesky
|
|
||||||
POSTGRES_PASSWORD: "${POSTGRES_PASSWORD:?specify password in .env file}"
|
|
||||||
command: ["-c", "max_connections=1000"]
|
|
|
@@ -1,25 +0,0 @@
|
||||||
#!/bin/sh
|
|
||||||
|
|
||||||
output="psql_$(date '+%y%m%d_%H%M%S').log"
|
|
||||||
|
|
||||||
set -x
|
|
||||||
|
|
||||||
docker compose stop postgres
|
|
||||||
|
|
||||||
. ./.env
|
|
||||||
sudo rm -rf ${DATA_DIR:?DATA_DIR not set}/benchmark
|
|
||||||
|
|
||||||
echo "$(date): Starting data import..."
|
|
||||||
|
|
||||||
docker compose up -d postgres
|
|
||||||
|
|
||||||
while ! docker compose exec postgres psql -U postgres -d bluesky -c 'select 1;'; do sleep 1; done
|
|
||||||
|
|
||||||
cat ../records.sql | docker compose exec -iT postgres psql -U postgres -d bluesky
|
|
||||||
lz4cat ../records.sql.lz4 | docker compose exec -iT postgres psql -U postgres -d bluesky
|
|
||||||
|
|
||||||
echo "$(date): Data import done"
|
|
||||||
|
|
||||||
cat ../db-migration/migrations/20240217_partition.sql \
|
|
||||||
| docker compose exec -iT postgres psql -U postgres -d bluesky --echo-queries -c '\timing' \
|
|
||||||
| tee -a "${output}"
|
|
|
@@ -1,14 +0,0 @@
|
||||||
FROM golang:1.22.3 as builder
|
|
||||||
WORKDIR /app
|
|
||||||
COPY go.mod go.sum ./
|
|
||||||
RUN go mod download
|
|
||||||
COPY . ./
|
|
||||||
RUN go build -trimpath ./cmd/consumer
|
|
||||||
|
|
||||||
FROM alpine:latest as certs
|
|
||||||
RUN apk --update add ca-certificates
|
|
||||||
|
|
||||||
FROM debian:stable-slim
|
|
||||||
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
|
|
||||||
COPY --from=builder /app/consumer .
|
|
||||||
ENTRYPOINT ["./consumer"]
|
|
|
@@ -1,7 +0,0 @@
|
||||||
*
|
|
||||||
**/*
|
|
||||||
!go.mod
|
|
||||||
!go.sum
|
|
||||||
!**/*.go
|
|
||||||
cmd/**
|
|
||||||
!cmd/consumer
|
|
|
@@ -1,622 +0,0 @@
|
||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"bytes"
|
|
||||||
"context"
|
|
||||||
"errors"
|
|
||||||
"fmt"
|
|
||||||
"io"
|
|
||||||
"math"
|
|
||||||
"net/http"
|
|
||||||
"net/url"
|
|
||||||
"path"
|
|
||||||
"strings"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/cenkalti/backoff/v4"
|
|
||||||
"github.com/gorilla/websocket"
|
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
|
||||||
"github.com/rs/zerolog"
|
|
||||||
"gorm.io/gorm"
|
|
||||||
"gorm.io/gorm/clause"
|
|
||||||
|
|
||||||
comatproto "github.com/bluesky-social/indigo/api/atproto"
|
|
||||||
"github.com/bluesky-social/indigo/xrpc"
|
|
||||||
"github.com/ipld/go-ipld-prime/codec/dagcbor"
|
|
||||||
"github.com/ipld/go-ipld-prime/datamodel"
|
|
||||||
"github.com/ipld/go-ipld-prime/node/basicnode"
|
|
||||||
|
|
||||||
"github.com/uabluerail/indexer/models"
|
|
||||||
"github.com/uabluerail/indexer/pds"
|
|
||||||
"github.com/uabluerail/indexer/repo"
|
|
||||||
"github.com/uabluerail/indexer/util/fix"
|
|
||||||
"github.com/uabluerail/indexer/util/resolver"
|
|
||||||
)
|
|
||||||
|
|
||||||
const lastRevUpdateInterval = 24 * time.Hour
|
|
||||||
|
|
||||||
type BadRecord struct {
|
|
||||||
ID models.ID `gorm:"primarykey"`
|
|
||||||
CreatedAt time.Time
|
|
||||||
PDS models.ID `gorm:"index"`
|
|
||||||
Cursor int64
|
|
||||||
Error string
|
|
||||||
Content []byte
|
|
||||||
}
|
|
||||||
|
|
||||||
type Consumer struct {
|
|
||||||
db *gorm.DB
|
|
||||||
remote pds.PDS
|
|
||||||
running chan struct{}
|
|
||||||
|
|
||||||
lastCursorPersist time.Time
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewConsumer(ctx context.Context, remote *pds.PDS, db *gorm.DB) (*Consumer, error) {
|
|
||||||
if err := db.AutoMigrate(&BadRecord{}); err != nil {
|
|
||||||
return nil, fmt.Errorf("db.AutoMigrate: %s", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return &Consumer{
|
|
||||||
db: db,
|
|
||||||
remote: *remote,
|
|
||||||
running: make(chan struct{}),
|
|
||||||
}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *Consumer) Start(ctx context.Context) error {
|
|
||||||
go c.run(ctx)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *Consumer) Wait(ctx context.Context) error {
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
return ctx.Err()
|
|
||||||
case <-c.running:
|
|
||||||
// Channel got closed
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *Consumer) run(ctx context.Context) {
|
|
||||||
log := zerolog.Ctx(ctx).With().Str("pds", c.remote.Host).Logger()
|
|
||||||
ctx = log.WithContext(ctx)
|
|
||||||
|
|
||||||
backoffTimer := backoff.NewExponentialBackOff(
|
|
||||||
backoff.WithMaxElapsedTime(0),
|
|
||||||
backoff.WithInitialInterval(time.Second),
|
|
||||||
backoff.WithMaxInterval(5*time.Minute),
|
|
||||||
)
|
|
||||||
pdsOnline.WithLabelValues(c.remote.Host).Set(0)
|
|
||||||
|
|
||||||
defer close(c.running)
|
|
||||||
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case <-c.running:
|
|
||||||
log.Error().Msgf("Attempt to start previously stopped consumer")
|
|
||||||
return
|
|
||||||
case <-ctx.Done():
|
|
||||||
log.Info().Msgf("Consumer stopped")
|
|
||||||
lastEventTimestamp.DeletePartialMatch(prometheus.Labels{"remote": c.remote.Host})
|
|
||||||
eventCounter.DeletePartialMatch(prometheus.Labels{"remote": c.remote.Host})
|
|
||||||
reposDiscovered.DeletePartialMatch(prometheus.Labels{"remote": c.remote.Host})
|
|
||||||
postsByLanguageIndexed.DeletePartialMatch(prometheus.Labels{"remote": c.remote.Host})
|
|
||||||
pdsOnline.DeletePartialMatch(prometheus.Labels{"remote": c.remote.Host})
|
|
||||||
return
|
|
||||||
default:
|
|
||||||
start := time.Now()
|
|
||||||
if err := c.runOnce(ctx); err != nil {
|
|
||||||
log.Error().Err(err).Msgf("Consumer of %q failed (will be restarted): %s", c.remote.Host, err)
|
|
||||||
connectionFailures.WithLabelValues(c.remote.Host).Inc()
|
|
||||||
}
|
|
||||||
if time.Since(start) > backoffTimer.MaxInterval*3 {
|
|
||||||
// XXX: assume that c.runOnce did some useful work in this case,
|
|
||||||
// even though it might have been stuck on some absurdly long timeouts.
|
|
||||||
backoffTimer.Reset()
|
|
||||||
}
|
|
||||||
time.Sleep(backoffTimer.NextBackOff())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *Consumer) runOnce(ctx context.Context) error {
|
|
||||||
log := zerolog.Ctx(ctx)
|
|
||||||
|
|
||||||
log.Info().
|
|
||||||
Int64("cursor", c.remote.Cursor).
|
|
||||||
Int64("first_cursor_since_reset", c.remote.FirstCursorSinceReset).
|
|
||||||
Msgf("Connecting to firehose of %s...", c.remote.Host)
|
|
||||||
|
|
||||||
addr, err := url.Parse(c.remote.Host)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("parsing URL %q: %s", c.remote.Host, err)
|
|
||||||
}
|
|
||||||
addr.Scheme = "wss"
|
|
||||||
addr.Path = path.Join(addr.Path, "xrpc/com.atproto.sync.subscribeRepos")
|
|
||||||
|
|
||||||
if c.remote.Cursor > 0 {
|
|
||||||
params := url.Values{"cursor": []string{fmt.Sprint(c.remote.Cursor)}}
|
|
||||||
addr.RawQuery = params.Encode()
|
|
||||||
}
|
|
||||||
|
|
||||||
conn, _, err := websocket.DefaultDialer.DialContext(ctx, addr.String(), http.Header{})
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("establishing websocker connection: %w", err)
|
|
||||||
}
|
|
||||||
defer conn.Close()
|
|
||||||
|
|
||||||
pdsOnline.WithLabelValues(c.remote.Host).Set(1)
|
|
||||||
defer func() { pdsOnline.WithLabelValues(c.remote.Host).Set(0) }()
|
|
||||||
|
|
||||||
ch := make(chan bool)
|
|
||||||
defer close(ch)
|
|
||||||
go func() {
|
|
||||||
t := time.NewTicker(time.Minute)
|
|
||||||
defer t.Stop()
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case <-ch:
|
|
||||||
return
|
|
||||||
case <-t.C:
|
|
||||||
if err := conn.WriteControl(websocket.PingMessage, []byte("ping"), time.Now().Add(time.Minute)); err != nil {
|
|
||||||
log.Error().Err(err).Msgf("Failed to send ping: %s", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
first := true
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
return ctx.Err()
|
|
||||||
default:
|
|
||||||
_, b, err := conn.ReadMessage()
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("websocket.ReadMessage: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
r := bytes.NewReader(b)
|
|
||||||
proto := basicnode.Prototype.Any
|
|
||||||
headerNode := proto.NewBuilder()
|
|
||||||
if err := (&dagcbor.DecodeOptions{DontParseBeyondEnd: true}).Decode(headerNode, r); err != nil {
|
|
||||||
return fmt.Errorf("unmarshaling message header: %w", err)
|
|
||||||
}
|
|
||||||
header, err := parseHeader(headerNode.Build())
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("parsing message header: %w", err)
|
|
||||||
}
|
|
||||||
switch header.Op {
|
|
||||||
case 1:
|
|
||||||
if err := c.processMessage(ctx, header.Type, r, first); err != nil {
|
|
||||||
if ctx.Err() != nil {
|
|
||||||
// We're shutting down, so the error is most likely due to that.
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
const maxBadRecords = 500
|
|
||||||
var count int64
|
|
||||||
if err2 := c.db.Model(&BadRecord{}).Where(&BadRecord{PDS: c.remote.ID}).Count(&count).Error; err2 != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
if count >= maxBadRecords {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
log.Error().Err(err).Str("pds", c.remote.Host).Msgf("Failed to process message at cursor %d: %s", c.remote.Cursor, err)
|
|
||||||
err := c.db.Create(&BadRecord{
|
|
||||||
PDS: c.remote.ID,
|
|
||||||
Cursor: c.remote.Cursor,
|
|
||||||
Error: err.Error(),
|
|
||||||
Content: b,
|
|
||||||
}).Error
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("failed to store bad message: %s", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
case -1:
|
|
||||||
bodyNode := proto.NewBuilder()
|
|
||||||
if err := (&dagcbor.DecodeOptions{DontParseBeyondEnd: true, AllowLinks: true}).Decode(bodyNode, r); err != nil {
|
|
||||||
return fmt.Errorf("unmarshaling message body: %w", err)
|
|
||||||
}
|
|
||||||
body, err := parseError(bodyNode.Build())
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("parsing error payload: %w", err)
|
|
||||||
}
|
|
||||||
return &body
|
|
||||||
default:
|
|
||||||
log.Warn().Msgf("Unknown 'op' value received: %d", header.Op)
|
|
||||||
}
|
|
||||||
first = false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *Consumer) resetCursor(ctx context.Context, seq int64) error {
|
|
||||||
zerolog.Ctx(ctx).Warn().Str("pds", c.remote.Host).Msgf("Cursor reset: %d -> %d", c.remote.Cursor, seq)
|
|
||||||
err := c.db.Model(&c.remote).
|
|
||||||
Where(&pds.PDS{ID: c.remote.ID}).
|
|
||||||
Updates(&pds.PDS{FirstCursorSinceReset: seq}).Error
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("updating FirstCursorSinceReset: %w", err)
|
|
||||||
}
|
|
||||||
c.remote.FirstCursorSinceReset = seq
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *Consumer) updateCursor(ctx context.Context, seq int64) error {
|
|
||||||
if math.Abs(float64(seq-c.remote.Cursor)) < 100 && time.Since(c.lastCursorPersist) < 5*time.Second {
|
|
||||||
c.remote.Cursor = seq
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
err := c.db.Model(&c.remote).
|
|
||||||
Where(&pds.PDS{ID: c.remote.ID}).
|
|
||||||
Updates(&pds.PDS{Cursor: seq}).Error
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("updating Cursor: %w", err)
|
|
||||||
}
|
|
||||||
c.remote.Cursor = seq
|
|
||||||
return nil
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *Consumer) processMessage(ctx context.Context, typ string, r io.Reader, first bool) error {
|
|
||||||
log := zerolog.Ctx(ctx)
|
|
||||||
|
|
||||||
eventCounter.WithLabelValues(c.remote.Host, typ).Inc()
|
|
||||||
|
|
||||||
switch typ {
|
|
||||||
case "#commit":
|
|
||||||
payload := &comatproto.SyncSubscribeRepos_Commit{}
|
|
||||||
if err := payload.UnmarshalCBOR(r); err != nil {
|
|
||||||
return fmt.Errorf("failed to unmarshal commit: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
exportEventTimestamp(ctx, c.remote.Host, payload.Time)
|
|
||||||
|
|
||||||
if c.remote.FirstCursorSinceReset == 0 {
|
|
||||||
if err := c.resetCursor(ctx, payload.Seq); err != nil {
|
|
||||||
return fmt.Errorf("handling cursor reset: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
repoInfo, created, err := repo.EnsureExists(ctx, c.db, payload.Repo)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("repo.EnsureExists(%q): %w", payload.Repo, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if repoInfo.LastKnownKey == "" {
|
|
||||||
_, pubKey, err := resolver.GetPDSEndpointAndPublicKey(ctx, payload.Repo)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("failed to get DID doc for %q: %w", payload.Repo, err)
|
|
||||||
}
|
|
||||||
repoInfo.LastKnownKey = pubKey
|
|
||||||
err = c.db.Model(repoInfo).Where(&repo.Repo{ID: repoInfo.ID}).Updates(&repo.Repo{LastKnownKey: pubKey}).Error
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("failed to update the key for %q: %w", payload.Repo, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if repoInfo.PDS != c.remote.ID {
|
|
||||||
u, _, err := resolver.GetPDSEndpointAndPublicKey(ctx, payload.Repo)
|
|
||||||
if err == nil {
|
|
||||||
cur, err := pds.EnsureExists(ctx, c.db, u.String())
|
|
||||||
if err == nil {
|
|
||||||
if repoInfo.PDS != cur.ID {
|
|
||||||
// Repo was migrated, let's update our record.
|
|
||||||
err := c.db.Model(repoInfo).Where(&repo.Repo{ID: repoInfo.ID}).Updates(&repo.Repo{PDS: cur.ID}).Error
|
|
||||||
if err != nil {
|
|
||||||
log.Error().Err(err).Msgf("Repo %q was migrated to %q, but updating the repo has failed: %s", payload.Repo, cur.Host, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
repoInfo.PDS = cur.ID
|
|
||||||
} else {
|
|
||||||
log.Error().Err(err).Msgf("Failed to get PDS record for %q: %s", u, err)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
log.Error().Err(err).Msgf("Failed to get PDS endpoint for repo %q: %s", payload.Repo, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if repoInfo.PDS != c.remote.ID {
|
|
||||||
// We checked a recent version of DID doc and this is still not a correct PDS.
|
|
||||||
log.Error().Str("did", payload.Repo).Str("rev", payload.Rev).
|
|
||||||
Msgf("Commit from an incorrect PDS, skipping")
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if created {
|
|
||||||
reposDiscovered.WithLabelValues(c.remote.Host).Inc()
|
|
||||||
}
|
|
||||||
|
|
||||||
expectRecords := false
|
|
||||||
deletions := []string{}
|
|
||||||
for _, op := range payload.Ops {
|
|
||||||
switch op.Action {
|
|
||||||
case "create":
|
|
||||||
expectRecords = true
|
|
||||||
case "update":
|
|
||||||
expectRecords = true
|
|
||||||
case "delete":
|
|
||||||
deletions = append(deletions, op.Path)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for _, d := range deletions {
|
|
||||||
parts := strings.SplitN(d, "/", 2)
|
|
||||||
if len(parts) != 2 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
err := c.db.Model(&repo.Record{}).
|
|
||||||
Where(&repo.Record{
|
|
||||||
Repo: models.ID(repoInfo.ID),
|
|
||||||
Collection: parts[0],
|
|
||||||
Rkey: parts[1]}).
|
|
||||||
Updates(&repo.Record{Deleted: true}).Error
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("failed to mark %s/%s as deleted: %w", payload.Repo, d, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
newRecs, err := repo.ExtractRecords(ctx, bytes.NewReader(payload.Blocks), repoInfo.LastKnownKey)
|
|
||||||
if errors.Is(err, repo.ErrInvalidSignature) {
|
|
||||||
// Key might have been updated recently.
|
|
||||||
_, pubKey, err2 := resolver.GetPDSEndpointAndPublicKey(ctx, payload.Repo)
|
|
||||||
if err2 != nil {
|
|
||||||
return fmt.Errorf("failed to get DID doc for %q: %w", payload.Repo, err2)
|
|
||||||
}
|
|
||||||
if repoInfo.LastKnownKey != pubKey {
|
|
||||||
repoInfo.LastKnownKey = pubKey
|
|
||||||
err2 = c.db.Model(repoInfo).Where(&repo.Repo{ID: repoInfo.ID}).Updates(&repo.Repo{LastKnownKey: pubKey}).Error
|
|
||||||
if err2 != nil {
|
|
||||||
return fmt.Errorf("failed to update the key for %q: %w", payload.Repo, err2)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Retry with the new key.
|
|
||||||
newRecs, err = repo.ExtractRecords(ctx, bytes.NewReader(payload.Blocks), pubKey)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("failed to extract records: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
recs := []repo.Record{}
|
|
||||||
for k, v := range newRecs {
|
|
||||||
parts := strings.SplitN(k, "/", 2)
|
|
||||||
if len(parts) != 2 {
|
|
||||||
log.Warn().Msgf("Unexpected key format: %q", k)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
langs, _, err := repo.GetLang(ctx, v)
|
|
||||||
if err == nil {
|
|
||||||
for _, lang := range langs {
|
|
||||||
postsByLanguageIndexed.WithLabelValues(c.remote.Host, lang).Inc()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
recs = append(recs, repo.Record{
|
|
||||||
Repo: models.ID(repoInfo.ID),
|
|
||||||
Collection: parts[0],
|
|
||||||
Rkey: parts[1],
|
|
||||||
// XXX: proper replacement of \u0000 would require full parsing of JSON
|
|
||||||
// and recursive iteration over all string values, but this
|
|
||||||
// should work well enough for now.
|
|
||||||
Content: fix.EscapeNullCharForPostgres(v),
|
|
||||||
AtRev: payload.Rev,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
if len(recs) == 0 && expectRecords {
|
|
||||||
log.Debug().Int64("seq", payload.Seq).Str("pds", c.remote.Host).Msgf("len(recs) == 0")
|
|
||||||
}
|
|
||||||
if len(recs) > 0 {
|
|
||||||
err = c.db.Model(&repo.Record{}).
|
|
||||||
Clauses(clause.OnConflict{
|
|
||||||
Where: clause.Where{Exprs: []clause.Expression{clause.Or(
|
|
||||||
clause.Eq{Column: clause.Column{Name: "at_rev", Table: "records"}, Value: nil},
|
|
||||||
clause.Eq{Column: clause.Column{Name: "at_rev", Table: "records"}, Value: ""},
|
|
||||||
clause.Lt{
|
|
||||||
Column: clause.Column{Name: "at_rev", Table: "records"},
|
|
||||||
Value: clause.Column{Name: "at_rev", Table: "excluded"}},
|
|
||||||
)}},
|
|
||||||
DoUpdates: clause.AssignmentColumns([]string{"content", "at_rev"}),
|
|
||||||
Columns: []clause.Column{{Name: "repo"}, {Name: "collection"}, {Name: "rkey"}}}).
|
|
||||||
Create(recs).Error
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("inserting records into the database: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if repoInfo.FirstCursorSinceReset > 0 && repoInfo.FirstRevSinceReset != "" &&
|
|
||||||
repoInfo.LastIndexedRev != "" &&
|
|
||||||
c.remote.FirstCursorSinceReset > 0 &&
|
|
||||||
repoInfo.FirstCursorSinceReset >= c.remote.FirstCursorSinceReset &&
|
|
||||||
repoInfo.FirstRevSinceReset <= repoInfo.LastIndexedRev &&
|
|
||||||
time.Since(repoInfo.UpdatedAt) > lastRevUpdateInterval {
|
|
||||||
|
|
||||||
err = c.db.Model(&repo.Repo{}).Where(&repo.Repo{ID: repoInfo.ID}).
|
|
||||||
Updates(&repo.Repo{
|
|
||||||
LastFirehoseRev: payload.Rev,
|
|
||||||
}).Error
|
|
||||||
if err != nil {
|
|
||||||
log.Error().Err(err).Msgf("Failed to update last_firehose_rev for %q: %s", repoInfo.DID, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if payload.TooBig {
|
|
||||||
// Just trigger a re-index by resetting rev.
|
|
||||||
err := c.db.Model(&repo.Repo{}).Where(&repo.Repo{ID: repoInfo.ID}).
|
|
||||||
Updates(&repo.Repo{
|
|
||||||
FirstCursorSinceReset: c.remote.FirstCursorSinceReset,
|
|
||||||
FirstRevSinceReset: payload.Rev,
|
|
||||||
}).Error
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("failed to update repo info after cursor reset: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if repoInfo.FirstCursorSinceReset != c.remote.FirstCursorSinceReset {
|
|
||||||
err := c.db.Model(&repo.Repo{}).Debug().Where(&repo.Repo{ID: repoInfo.ID}).
|
|
||||||
Updates(&repo.Repo{
|
|
||||||
FirstCursorSinceReset: c.remote.FirstCursorSinceReset,
|
|
||||||
FirstRevSinceReset: payload.Rev,
|
|
||||||
}).Error
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("failed to update repo info after cursor reset: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := c.updateCursor(ctx, payload.Seq); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
case "#handle":
|
|
||||||
payload := &comatproto.SyncSubscribeRepos_Handle{}
|
|
||||||
if err := payload.UnmarshalCBOR(r); err != nil {
|
|
||||||
return fmt.Errorf("failed to unmarshal commit: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
exportEventTimestamp(ctx, c.remote.Host, payload.Time)
|
|
||||||
|
|
||||||
if c.remote.FirstCursorSinceReset == 0 {
|
|
||||||
if err := c.resetCursor(ctx, payload.Seq); err != nil {
|
|
||||||
return fmt.Errorf("handling cursor reset: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// No-op, we don't store handles.
|
|
||||||
if err := c.updateCursor(ctx, payload.Seq); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
case "#migrate":
|
|
||||||
payload := &comatproto.SyncSubscribeRepos_Migrate{}
|
|
||||||
if err := payload.UnmarshalCBOR(r); err != nil {
|
|
||||||
return fmt.Errorf("failed to unmarshal commit: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
exportEventTimestamp(ctx, c.remote.Host, payload.Time)
|
|
||||||
|
|
||||||
if c.remote.FirstCursorSinceReset == 0 {
|
|
||||||
if err := c.resetCursor(ctx, payload.Seq); err != nil {
|
|
||||||
return fmt.Errorf("handling cursor reset: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
log.Debug().Interface("payload", payload).Str("did", payload.Did).Msgf("MIGRATION")
|
|
||||||
// TODO
|
|
||||||
if err := c.updateCursor(ctx, payload.Seq); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
case "#tombstone":
|
|
||||||
payload := &comatproto.SyncSubscribeRepos_Tombstone{}
|
|
||||||
if err := payload.UnmarshalCBOR(r); err != nil {
|
|
||||||
return fmt.Errorf("failed to unmarshal commit: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
exportEventTimestamp(ctx, c.remote.Host, payload.Time)
|
|
||||||
|
|
||||||
if c.remote.FirstCursorSinceReset == 0 {
|
|
||||||
if err := c.resetCursor(ctx, payload.Seq); err != nil {
|
|
||||||
return fmt.Errorf("handling cursor reset: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// TODO
|
|
||||||
if err := c.updateCursor(ctx, payload.Seq); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
case "#info":
|
|
||||||
payload := &comatproto.SyncSubscribeRepos_Info{}
|
|
||||||
if err := payload.UnmarshalCBOR(r); err != nil {
|
|
||||||
return fmt.Errorf("failed to unmarshal commit: %w", err)
|
|
||||||
}
|
|
||||||
switch payload.Name {
|
|
||||||
case "OutdatedCursor":
|
|
||||||
if !first {
|
|
||||||
log.Warn().Msgf("Received cursor reset notification in the middle of a stream: %+v", payload)
|
|
||||||
}
|
|
||||||
c.remote.FirstCursorSinceReset = 0
|
|
||||||
default:
|
|
||||||
log.Error().Msgf("Unknown #info message %q: %+v", payload.Name, payload)
|
|
||||||
}
|
|
||||||
case "#identity":
|
|
||||||
payload := &comatproto.SyncSubscribeRepos_Identity{}
|
|
||||||
if err := payload.UnmarshalCBOR(r); err != nil {
|
|
||||||
return fmt.Errorf("failed to unmarshal commit: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
exportEventTimestamp(ctx, c.remote.Host, payload.Time)
|
|
||||||
log.Trace().Str("did", payload.Did).Str("type", typ).Int64("seq", payload.Seq).
|
|
||||||
Msgf("#identity message: %s seq=%d time=%q", payload.Did, payload.Seq, payload.Time)
|
|
||||||
|
|
||||||
resolver.Resolver.FlushCacheFor(payload.Did)
|
|
||||||
|
|
||||||
// TODO: fetch DID doc and update PDS field?
|
|
||||||
default:
|
|
||||||
b, err := io.ReadAll(r)
|
|
||||||
if err != nil {
|
|
||||||
log.Error().Err(err).Msgf("Failed to read message payload: %s", err)
|
|
||||||
}
|
|
||||||
log.Warn().Msgf("Unknown message type received: %s payload=%q", typ, string(b))
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
type Header struct {
|
|
||||||
Op int64
|
|
||||||
Type string
|
|
||||||
}
|
|
||||||
|
|
||||||
func parseHeader(node datamodel.Node) (Header, error) {
|
|
||||||
r := Header{}
|
|
||||||
op, err := node.LookupByString("op")
|
|
||||||
if err != nil {
|
|
||||||
return r, fmt.Errorf("missing 'op': %w", err)
|
|
||||||
}
|
|
||||||
r.Op, err = op.AsInt()
|
|
||||||
if err != nil {
|
|
||||||
return r, fmt.Errorf("op.AsInt(): %w", err)
|
|
||||||
}
|
|
||||||
if r.Op == -1 {
|
|
||||||
// Error frame, type should not be present
|
|
||||||
return r, nil
|
|
||||||
}
|
|
||||||
t, err := node.LookupByString("t")
|
|
||||||
if err != nil {
|
|
||||||
return r, fmt.Errorf("missing 't': %w", err)
|
|
||||||
}
|
|
||||||
r.Type, err = t.AsString()
|
|
||||||
if err != nil {
|
|
||||||
return r, fmt.Errorf("t.AsString(): %w", err)
|
|
||||||
}
|
|
||||||
return r, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func parseError(node datamodel.Node) (xrpc.XRPCError, error) {
|
|
||||||
r := xrpc.XRPCError{}
|
|
||||||
e, err := node.LookupByString("error")
|
|
||||||
if err != nil {
|
|
||||||
return r, fmt.Errorf("missing 'error': %w", err)
|
|
||||||
}
|
|
||||||
r.ErrStr, err = e.AsString()
|
|
||||||
if err != nil {
|
|
||||||
return r, fmt.Errorf("error.AsString(): %w", err)
|
|
||||||
}
|
|
||||||
m, err := node.LookupByString("message")
|
|
||||||
if err == nil {
|
|
||||||
r.Message, err = m.AsString()
|
|
||||||
if err != nil {
|
|
||||||
return r, fmt.Errorf("message.AsString(): %w", err)
|
|
||||||
}
|
|
||||||
} else if !errors.Is(err, datamodel.ErrNotExists{}) {
|
|
||||||
return r, fmt.Errorf("looking up 'message': %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return r, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func exportEventTimestamp(ctx context.Context, remote string, timestamp string) {
|
|
||||||
if t, err := time.Parse(time.RFC3339, timestamp); err != nil {
|
|
||||||
zerolog.Ctx(ctx).Error().Err(err).Str("pds", remote).Msgf("Failed to parse %q as a timestamp: %s", timestamp, err)
|
|
||||||
} else {
|
|
||||||
lastEventTimestamp.WithLabelValues(remote).Set(float64(t.Unix()))
|
|
||||||
}
|
|
||||||
}
|
|
|
@@ -1,268 +0,0 @@
|
||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"flag"
|
|
||||||
"fmt"
|
|
||||||
"io"
|
|
||||||
"log"
|
|
||||||
"net/http"
|
|
||||||
_ "net/http/pprof"
|
|
||||||
"os"
|
|
||||||
"os/signal"
|
|
||||||
"path/filepath"
|
|
||||||
"runtime"
|
|
||||||
"runtime/debug"
|
|
||||||
"strings"
|
|
||||||
"sync"
|
|
||||||
"syscall"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
_ "github.com/joho/godotenv/autoload"
|
|
||||||
"github.com/kelseyhightower/envconfig"
|
|
||||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
|
||||||
"github.com/rs/zerolog"
|
|
||||||
"gorm.io/driver/postgres"
|
|
||||||
"gorm.io/gorm"
|
|
||||||
"gorm.io/gorm/logger"
|
|
||||||
|
|
||||||
"github.com/uabluerail/indexer/pds"
|
|
||||||
"github.com/uabluerail/indexer/util/gormzerolog"
|
|
||||||
)
|
|
||||||
|
|
||||||
type Config struct {
|
|
||||||
LogFile string
|
|
||||||
LogFormat string `default:"text"`
|
|
||||||
LogLevel int64 `default:"1"`
|
|
||||||
MetricsPort string `split_words:"true"`
|
|
||||||
DBUrl string `envconfig:"POSTGRES_URL"`
|
|
||||||
Relays string
|
|
||||||
}
|
|
||||||
|
|
||||||
var config Config
|
|
||||||
|
|
||||||
func runMain(ctx context.Context) error {
|
|
||||||
ctx = setupLogging(ctx)
|
|
||||||
log := zerolog.Ctx(ctx)
|
|
||||||
log.Debug().Msgf("Starting up...")
|
|
||||||
db, err := gorm.Open(postgres.Open(config.DBUrl), &gorm.Config{
|
|
||||||
Logger: gormzerolog.New(&logger.Config{
|
|
||||||
SlowThreshold: 3 * time.Second,
|
|
||||||
IgnoreRecordNotFoundError: true,
|
|
||||||
}, nil),
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("connecting to the database: %w", err)
|
|
||||||
}
|
|
||||||
log.Debug().Msgf("DB connection established")
|
|
||||||
|
|
||||||
if config.Relays != "" {
|
|
||||||
for _, host := range strings.Split(config.Relays, ",") {
|
|
||||||
c, err := NewRelayConsumer(ctx, host, db)
|
|
||||||
if err != nil {
|
|
||||||
log.Error().Err(err).Msgf("Failed to create relay consumer for %q: %s", host, err)
|
|
||||||
}
|
|
||||||
c.Start(ctx)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
consumersCh := make(chan struct{})
|
|
||||||
go runConsumers(ctx, db, consumersCh)
|
|
||||||
|
|
||||||
log.Info().Msgf("Starting HTTP listener on %q...", config.MetricsPort)
|
|
||||||
http.Handle("/metrics", promhttp.Handler())
|
|
||||||
srv := &http.Server{Addr: fmt.Sprintf(":%s", config.MetricsPort)}
|
|
||||||
errCh := make(chan error)
|
|
||||||
go func() {
|
|
||||||
errCh <- srv.ListenAndServe()
|
|
||||||
}()
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
if err := srv.Shutdown(context.Background()); err != nil {
|
|
||||||
return fmt.Errorf("HTTP server shutdown failed: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
log.Info().Msgf("Waiting for consumers to stop...")
|
|
||||||
<-consumersCh
|
|
||||||
return <-errCh
|
|
||||||
}
|
|
||||||
|
|
||||||
func runConsumers(ctx context.Context, db *gorm.DB, doneCh chan struct{}) {
|
|
||||||
log := zerolog.Ctx(ctx)
|
|
||||||
defer close(doneCh)
|
|
||||||
|
|
||||||
type consumerHandle struct {
|
|
||||||
cancel context.CancelFunc
|
|
||||||
consumer *Consumer
|
|
||||||
}
|
|
||||||
|
|
||||||
running := map[string]consumerHandle{}
|
|
||||||
|
|
||||||
ticker := time.NewTicker(time.Minute)
|
|
||||||
defer ticker.Stop()
|
|
||||||
t := make(chan time.Time, 1)
|
|
||||||
t <- time.Now()
|
|
||||||
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case <-t:
|
|
||||||
remotes := []pds.PDS{}
|
|
||||||
if err := db.Find(&remotes).Error; err != nil {
|
|
||||||
log.Error().Err(err).Msgf("Failed to get a list of known PDSs: %s", err)
|
|
||||||
break
|
|
||||||
}
|
|
||||||
|
|
||||||
shouldBeRunning := map[string]pds.PDS{}
|
|
||||||
for _, remote := range remotes {
|
|
||||||
if remote.Disabled {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
shouldBeRunning[remote.Host] = remote
|
|
||||||
}
|
|
||||||
|
|
||||||
for host, handle := range running {
|
|
||||||
if _, found := shouldBeRunning[host]; found {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
handle.cancel()
|
|
||||||
_ = handle.consumer.Wait(ctx)
|
|
||||||
delete(running, host)
|
|
||||||
}
|
|
||||||
|
|
||||||
for host, remote := range shouldBeRunning {
|
|
||||||
if _, found := running[host]; found {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
subCtx, cancel := context.WithCancel(ctx)
|
|
||||||
|
|
||||||
c, err := NewConsumer(subCtx, &remote, db)
|
|
||||||
if err != nil {
|
|
||||||
log.Error().Err(err).Msgf("Failed to create a consumer for %q: %s", remote.Host, err)
|
|
||||||
cancel()
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if err := c.Start(subCtx); err != nil {
|
|
||||||
log.Error().Err(err).Msgf("Failed ot start a consumer for %q: %s", remote.Host, err)
|
|
||||||
cancel()
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
running[host] = consumerHandle{
|
|
||||||
cancel: cancel,
|
|
||||||
consumer: c,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
case <-ctx.Done():
|
|
||||||
var wg sync.WaitGroup
|
|
||||||
for host, handle := range running {
|
|
||||||
wg.Add(1)
|
|
||||||
go func(handle consumerHandle) {
|
|
||||||
handle.cancel()
|
|
||||||
_ = handle.consumer.Wait(ctx)
|
|
||||||
wg.Done()
|
|
||||||
}(handle)
|
|
||||||
delete(running, host)
|
|
||||||
}
|
|
||||||
wg.Wait()
|
|
||||||
|
|
||||||
case v := <-ticker.C:
|
|
||||||
// Non-blocking send.
|
|
||||||
select {
|
|
||||||
case t <- v:
|
|
||||||
default:
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func main() {
|
|
||||||
flag.StringVar(&config.LogFile, "log", "", "Path to the log file. If empty, will log to stderr")
|
|
||||||
flag.StringVar(&config.LogFormat, "log-format", "text", "Logging format. 'text' or 'json'")
|
|
||||||
flag.Int64Var(&config.LogLevel, "log-level", 1, "Log level. -1 - trace, 0 - debug, 1 - info, 5 - panic")
|
|
||||||
flag.StringVar(&config.Relays, "relays", "", "List of relays to connect to (for discovering new PDSs)")
|
|
||||||
|
|
||||||
if err := envconfig.Process("consumer", &config); err != nil {
|
|
||||||
log.Fatalf("envconfig.Process: %s", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
flag.Parse()
|
|
||||||
|
|
||||||
ctx, _ := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
|
||||||
if err := runMain(ctx); err != nil {
|
|
||||||
fmt.Fprintln(os.Stderr, err)
|
|
||||||
os.Exit(1)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func setupLogging(ctx context.Context) context.Context {
|
|
||||||
logFile := os.Stderr
|
|
||||||
|
|
||||||
if config.LogFile != "" {
|
|
||||||
f, err := os.OpenFile(config.LogFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
|
|
||||||
if err != nil {
|
|
||||||
log.Fatalf("Failed to open the specified log file %q: %s", config.LogFile, err)
|
|
||||||
}
|
|
||||||
logFile = f
|
|
||||||
}
|
|
||||||
|
|
||||||
var output io.Writer
|
|
||||||
|
|
||||||
switch config.LogFormat {
|
|
||||||
case "json":
|
|
||||||
output = logFile
|
|
||||||
case "text":
|
|
||||||
prefixList := []string{}
|
|
||||||
info, ok := debug.ReadBuildInfo()
|
|
||||||
if ok {
|
|
||||||
prefixList = append(prefixList, info.Path+"/")
|
|
||||||
}
|
|
||||||
|
|
||||||
basedir := ""
|
|
||||||
_, sourceFile, _, ok := runtime.Caller(0)
|
|
||||||
if ok {
|
|
||||||
basedir = filepath.Dir(sourceFile)
|
|
||||||
}
|
|
||||||
|
|
||||||
if basedir != "" && strings.HasPrefix(basedir, "/") {
|
|
||||||
prefixList = append(prefixList, basedir+"/")
|
|
||||||
head, _ := filepath.Split(basedir)
|
|
||||||
for head != "/" {
|
|
||||||
prefixList = append(prefixList, head)
|
|
||||||
head, _ = filepath.Split(strings.TrimSuffix(head, "/"))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
output = zerolog.ConsoleWriter{
|
|
||||||
Out: logFile,
|
|
||||||
NoColor: true,
|
|
||||||
TimeFormat: time.RFC3339,
|
|
||||||
PartsOrder: []string{
|
|
||||||
zerolog.LevelFieldName,
|
|
||||||
zerolog.TimestampFieldName,
|
|
||||||
zerolog.CallerFieldName,
|
|
||||||
zerolog.MessageFieldName,
|
|
||||||
},
|
|
||||||
FormatFieldName: func(i interface{}) string { return fmt.Sprintf("%s:", i) },
|
|
||||||
FormatFieldValue: func(i interface{}) string { return fmt.Sprintf("%s", i) },
|
|
||||||
FormatCaller: func(i interface{}) string {
|
|
||||||
s := i.(string)
|
|
||||||
for _, p := range prefixList {
|
|
||||||
s = strings.TrimPrefix(s, p)
|
|
||||||
}
|
|
||||||
return s
|
|
||||||
},
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
log.Fatalf("Invalid log format specified: %q", config.LogFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
logger := zerolog.New(output).Level(zerolog.Level(config.LogLevel)).With().Caller().Timestamp().Logger()
|
|
||||||
|
|
||||||
ctx = logger.WithContext(ctx)
|
|
||||||
|
|
||||||
zerolog.DefaultContextLogger = &logger
|
|
||||||
log.SetOutput(logger)
|
|
||||||
|
|
||||||
return ctx
|
|
||||||
}
|
|
|
@@ -1,36 +0,0 @@
|
||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
|
||||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
|
||||||
)
|
|
||||||
|
|
||||||
var lastEventTimestamp = promauto.NewGaugeVec(prometheus.GaugeOpts{
|
|
||||||
Name: "repo_commit_received_timestamp",
|
|
||||||
Help: "Timestamp of the last event received from firehose.",
|
|
||||||
}, []string{"remote"})
|
|
||||||
|
|
||||||
var eventCounter = promauto.NewCounterVec(prometheus.CounterOpts{
|
|
||||||
Name: "repo_commits_received_counter",
|
|
||||||
Help: "Counter of events received from each remote.",
|
|
||||||
}, []string{"remote", "type"})
|
|
||||||
|
|
||||||
var reposDiscovered = promauto.NewCounterVec(prometheus.CounterOpts{
|
|
||||||
Name: "repo_discovered_counter",
|
|
||||||
Help: "Counter of newly discovered repos",
|
|
||||||
}, []string{"remote"})
|
|
||||||
|
|
||||||
var postsByLanguageIndexed = promauto.NewCounterVec(prometheus.CounterOpts{
|
|
||||||
Name: "indexer_posts_by_language_count",
|
|
||||||
Help: "Number of posts by language",
|
|
||||||
}, []string{"remote", "lang"})
|
|
||||||
|
|
||||||
var connectionFailures = promauto.NewCounterVec(prometheus.CounterOpts{
|
|
||||||
Name: "consumer_connection_failures",
|
|
||||||
Help: "Counter of firehose connection failures",
|
|
||||||
}, []string{"remote"})
|
|
||||||
|
|
||||||
var pdsOnline = promauto.NewGaugeVec(prometheus.GaugeOpts{
|
|
||||||
Name: "consumer_connection_up",
|
|
||||||
Help: "Status of a connection. 1 - up and running.",
|
|
||||||
}, []string{"remote"})
|
|
|
@@ -1,179 +0,0 @@
|
||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"bytes"
|
|
||||||
"context"
|
|
||||||
"fmt"
|
|
||||||
"io"
|
|
||||||
"net/http"
|
|
||||||
"net/url"
|
|
||||||
"path"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
comatproto "github.com/bluesky-social/indigo/api/atproto"
|
|
||||||
"github.com/gorilla/websocket"
|
|
||||||
"github.com/ipld/go-ipld-prime/codec/dagcbor"
|
|
||||||
"github.com/ipld/go-ipld-prime/node/basicnode"
|
|
||||||
"github.com/rs/zerolog"
|
|
||||||
"github.com/uabluerail/indexer/pds"
|
|
||||||
"github.com/uabluerail/indexer/util/resolver"
|
|
||||||
"gorm.io/gorm"
|
|
||||||
)
|
|
||||||
|
|
||||||
type RelayConsumer struct {
|
|
||||||
url string
|
|
||||||
db *gorm.DB
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewRelayConsumer(ctx context.Context, host string, db *gorm.DB) (*RelayConsumer, error) {
|
|
||||||
addr, err := url.Parse(host)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("parsing URL %q: %s", host, err)
|
|
||||||
}
|
|
||||||
addr.Scheme = "wss"
|
|
||||||
addr.Path = path.Join(addr.Path, "xrpc/com.atproto.sync.subscribeRepos")
|
|
||||||
return &RelayConsumer{db: db, url: addr.String()}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *RelayConsumer) Start(ctx context.Context) {
|
|
||||||
go c.run(ctx)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *RelayConsumer) run(ctx context.Context) {
|
|
||||||
log := zerolog.Ctx(ctx).With().Str("relay", c.url).Logger()
|
|
||||||
ctx = log.WithContext(ctx)
|
|
||||||
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
log.Info().Msgf("Relay consumer stopped")
|
|
||||||
return
|
|
||||||
default:
|
|
||||||
if err := c.runOnce(ctx); err != nil {
|
|
||||||
log.Error().Err(err).Msgf("Consumer of relay %q failed (will be restarted): %s", c.url, err)
|
|
||||||
}
|
|
||||||
time.Sleep(time.Second)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *RelayConsumer) runOnce(ctx context.Context) error {
|
|
||||||
log := zerolog.Ctx(ctx)
|
|
||||||
|
|
||||||
conn, _, err := websocket.DefaultDialer.DialContext(ctx, c.url, http.Header{})
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("establishing websocker connection: %w", err)
|
|
||||||
}
|
|
||||||
defer conn.Close()
|
|
||||||
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
return ctx.Err()
|
|
||||||
default:
|
|
||||||
_, b, err := conn.ReadMessage()
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("websocket.ReadMessage: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
r := bytes.NewReader(b)
|
|
||||||
proto := basicnode.Prototype.Any
|
|
||||||
headerNode := proto.NewBuilder()
|
|
||||||
if err := (&dagcbor.DecodeOptions{DontParseBeyondEnd: true}).Decode(headerNode, r); err != nil {
|
|
||||||
return fmt.Errorf("unmarshaling message header: %w", err)
|
|
||||||
}
|
|
||||||
header, err := parseHeader(headerNode.Build())
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("parsing message header: %w", err)
|
|
||||||
}
|
|
||||||
switch header.Op {
|
|
||||||
case 1:
|
|
||||||
if err := c.processMessage(ctx, header.Type, r); err != nil {
|
|
||||||
log.Info().Err(err).Msgf("Relay consumer failed to process a message: %s", err)
|
|
||||||
}
|
|
||||||
case -1:
|
|
||||||
bodyNode := proto.NewBuilder()
|
|
||||||
if err := (&dagcbor.DecodeOptions{DontParseBeyondEnd: true, AllowLinks: true}).Decode(bodyNode, r); err != nil {
|
|
||||||
return fmt.Errorf("unmarshaling message body: %w", err)
|
|
||||||
}
|
|
||||||
body, err := parseError(bodyNode.Build())
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("parsing error payload: %w", err)
|
|
||||||
}
|
|
||||||
return &body
|
|
||||||
default:
|
|
||||||
log.Warn().Msgf("Unknown 'op' value received: %d", header.Op)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *RelayConsumer) processMessage(ctx context.Context, typ string, r io.Reader) error {
|
|
||||||
log := zerolog.Ctx(ctx)
|
|
||||||
|
|
||||||
did := ""
|
|
||||||
|
|
||||||
switch typ {
|
|
||||||
case "#commit":
|
|
||||||
payload := &comatproto.SyncSubscribeRepos_Commit{}
|
|
||||||
if err := payload.UnmarshalCBOR(r); err != nil {
|
|
||||||
return fmt.Errorf("failed to unmarshal commit: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
did = payload.Repo
|
|
||||||
|
|
||||||
case "#handle":
|
|
||||||
payload := &comatproto.SyncSubscribeRepos_Handle{}
|
|
||||||
if err := payload.UnmarshalCBOR(r); err != nil {
|
|
||||||
return fmt.Errorf("failed to unmarshal commit: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
did = payload.Did
|
|
||||||
|
|
||||||
case "#migrate":
|
|
||||||
payload := &comatproto.SyncSubscribeRepos_Migrate{}
|
|
||||||
if err := payload.UnmarshalCBOR(r); err != nil {
|
|
||||||
return fmt.Errorf("failed to unmarshal commit: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
did = payload.Did
|
|
||||||
|
|
||||||
case "#tombstone":
|
|
||||||
payload := &comatproto.SyncSubscribeRepos_Tombstone{}
|
|
||||||
if err := payload.UnmarshalCBOR(r); err != nil {
|
|
||||||
return fmt.Errorf("failed to unmarshal commit: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
did = payload.Did
|
|
||||||
|
|
||||||
case "#info":
|
|
||||||
// Ignore
|
|
||||||
case "#identity":
|
|
||||||
payload := &comatproto.SyncSubscribeRepos_Identity{}
|
|
||||||
if err := payload.UnmarshalCBOR(r); err != nil {
|
|
||||||
return fmt.Errorf("failed to unmarshal commit: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
did = payload.Did
|
|
||||||
|
|
||||||
default:
|
|
||||||
b, err := io.ReadAll(r)
|
|
||||||
if err != nil {
|
|
||||||
log.Error().Err(err).Msgf("Failed to read message payload: %s", err)
|
|
||||||
}
|
|
||||||
log.Warn().Msgf("Unknown message type received: %s payload=%q", typ, string(b))
|
|
||||||
}
|
|
||||||
|
|
||||||
if did == "" {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
u, _, err := resolver.GetPDSEndpointAndPublicKey(ctx, did)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
_, err = pds.EnsureExists(ctx, c.db, u.String())
|
|
||||||
|
|
||||||
return err
|
|
||||||
}
|
|
|
@@ -1,14 +0,0 @@
|
||||||
FROM golang:1.22.3 as builder
|
|
||||||
WORKDIR /app
|
|
||||||
COPY go.mod go.sum ./
|
|
||||||
RUN go mod download
|
|
||||||
COPY . ./
|
|
||||||
RUN go build -trimpath ./cmd/lister
|
|
||||||
|
|
||||||
FROM alpine:latest as certs
|
|
||||||
RUN apk --update add ca-certificates
|
|
||||||
|
|
||||||
FROM debian:stable-slim
|
|
||||||
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
|
|
||||||
COPY --from=builder /app/lister .
|
|
||||||
ENTRYPOINT ["./lister"]
|
|
|
@@ -1,7 +0,0 @@
|
||||||
*
|
|
||||||
**/*
|
|
||||||
!go.mod
|
|
||||||
!go.sum
|
|
||||||
!**/*.go
|
|
||||||
cmd/**
|
|
||||||
!cmd/lister
|
|
|
@@ -1,151 +0,0 @@
|
||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"errors"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/rs/zerolog"
|
|
||||||
"gorm.io/gorm"
|
|
||||||
|
|
	comatproto "github.com/bluesky-social/indigo/api/atproto"
	"github.com/bluesky-social/indigo/did"

	"github.com/uabluerail/bsky-tools/pagination"
	"github.com/uabluerail/bsky-tools/xrpcauth"
	"github.com/uabluerail/indexer/pds"
	"github.com/uabluerail/indexer/repo"
	"github.com/uabluerail/indexer/util/resolver"
)

type Lister struct {
	db       *gorm.DB
	resolver did.Resolver

	pollInterval        time.Duration
	listRefreshInterval time.Duration
}

func NewLister(ctx context.Context, db *gorm.DB) (*Lister, error) {
	return &Lister{
		db:                  db,
		resolver:            resolver.Resolver,
		pollInterval:        5 * time.Minute,
		listRefreshInterval: 24 * time.Hour,
	}, nil
}

func (l *Lister) Start(ctx context.Context) error {
	go l.run(ctx)
	return nil
}

func (l *Lister) run(ctx context.Context) {
	log := zerolog.Ctx(ctx)
	ticker := time.NewTicker(l.pollInterval)

	log.Info().Msgf("Lister starting...")
	t := make(chan time.Time, 1)
	t <- time.Now()
	for {
		select {
		case <-ctx.Done():
			log.Info().Msgf("Lister stopped (context expired)")
			return
		case <-t:
			db := l.db.WithContext(ctx)

			remote := pds.PDS{}
			if err := db.Model(&remote).
				Where("(disabled=false or disabled is null) and (last_list is null or last_list < ?)", time.Now().Add(-l.listRefreshInterval)).
				Take(&remote).Error; err != nil {
				if !errors.Is(err, gorm.ErrRecordNotFound) {
					log.Error().Err(err).Msgf("Failed to query DB for a PDS to list repos from: %s", err)
				}
				break
			}

			if !pds.IsWhitelisted(remote.Host) {
				log.Info().Msgf("PDS %q is not whitelisted, disabling it", remote.Host)
				if err := db.Model(&remote).Where(&pds.PDS{ID: remote.ID}).Updates(&pds.PDS{Disabled: true}).Error; err != nil {
					log.Error().Err(err).Msgf("Failed to disable PDS %q: %s", remote.Host, err)
				}
				break
			}

			client := xrpcauth.NewAnonymousClient(ctx)
			client.Host = remote.Host

			log.Info().Msgf("Listing repos from %q...", remote.Host)
			repos, err := pagination.Reduce(
				func(cursor string) (resp *comatproto.SyncListRepos_Output, nextCursor string, err error) {
					resp, err = comatproto.SyncListRepos(ctx, client, cursor, 200)
					if err == nil && resp.Cursor != nil {
						nextCursor = *resp.Cursor
					}
					return
				},
				func(resp *comatproto.SyncListRepos_Output, acc []*comatproto.SyncListRepos_Repo) ([]*comatproto.SyncListRepos_Repo, error) {
					for _, repo := range resp.Repos {
						if repo == nil {
							continue
						}
						acc = append(acc, repo)
					}
					return acc, nil
				})

			if err != nil {
				log.Error().Err(err).Msgf("Failed to list repos from %q: %s", remote.Host, err)
				// Update the timestamp so we don't get stuck on a single broken PDS
				if err := db.Model(&remote).Updates(&pds.PDS{LastList: time.Now()}).Error; err != nil {
					log.Error().Err(err).Msgf("Failed to update the timestamp of last list for %q: %s", remote.Host, err)
				}
				break
			}
			log.Info().Msgf("Received %d DIDs from %q", len(repos), remote.Host)
			reposListed.WithLabelValues(remote.Host).Add(float64(len(repos)))

			for _, repoInfo := range repos {
				record, created, err := repo.EnsureExists(ctx, l.db, repoInfo.Did)
				if err != nil {
					log.Error().Err(err).Msgf("Failed to ensure that we have a record for the repo %q: %s", repoInfo.Did, err)
				} else if created {
					reposDiscovered.WithLabelValues(remote.Host).Inc()
				}

				if err == nil && record.FirstRevSinceReset == "" {
					// Populate this field in case it's empty, so we don't have to wait for the first firehose event
					// to trigger a resync.
					err := l.db.Transaction(func(tx *gorm.DB) error {
						var currentRecord repo.Repo
						if err := tx.Model(&record).Where(&repo.Repo{ID: record.ID}).Take(&currentRecord).Error; err != nil {
							return err
						}
						if currentRecord.FirstRevSinceReset != "" {
							// Someone else already updated it, nothing to do.
							return nil
						}
						var remote pds.PDS
						if err := tx.Model(&remote).Where(&pds.PDS{ID: record.PDS}).Take(&remote).Error; err != nil {
							return err
						}
						return tx.Model(&record).Where(&repo.Repo{ID: record.ID}).Updates(&repo.Repo{
							FirstRevSinceReset:    repoInfo.Rev,
							FirstCursorSinceReset: remote.FirstCursorSinceReset,
						}).Error
					})
					if err != nil {
						log.Error().Err(err).Msgf("Failed to set the initial FirstRevSinceReset value for %q: %s", repoInfo.Did, err)
					}
				}
			}

			if err := db.Model(&remote).Updates(&pds.PDS{LastList: time.Now()}).Error; err != nil {
				log.Error().Err(err).Msgf("Failed to update the timestamp of last list for %q: %s", remote.Host, err)
			}
		case v := <-ticker.C:
			t <- v
		}
	}
}

@ -1,168 +0,0 @@
|
||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"flag"
|
|
||||||
"fmt"
|
|
||||||
"io"
|
|
||||||
"log"
|
|
||||||
"net/http"
|
|
||||||
_ "net/http/pprof"
|
|
||||||
"os"
|
|
||||||
"os/signal"
|
|
||||||
"path/filepath"
|
|
||||||
"runtime"
|
|
||||||
"runtime/debug"
|
|
||||||
"strings"
|
|
||||||
"syscall"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
_ "github.com/joho/godotenv/autoload"
|
|
||||||
"github.com/kelseyhightower/envconfig"
|
|
||||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
|
||||||
"github.com/rs/zerolog"
|
|
||||||
"gorm.io/driver/postgres"
|
|
||||||
"gorm.io/gorm"
|
|
||||||
"gorm.io/gorm/logger"
|
|
||||||
|
|
||||||
"github.com/uabluerail/indexer/util/gormzerolog"
|
|
||||||
)
|
|
||||||
|
|
||||||
type Config struct {
|
|
||||||
LogFile string
|
|
||||||
LogFormat string `default:"text"`
|
|
||||||
LogLevel int64 `default:"1"`
|
|
||||||
MetricsPort string `split_words:"true"`
|
|
||||||
DBUrl string `envconfig:"POSTGRES_URL"`
|
|
||||||
}
|
|
||||||
|
|
||||||
var config Config
|
|
||||||
|
|
||||||
func runMain(ctx context.Context) error {
|
|
||||||
ctx = setupLogging(ctx)
|
|
||||||
log := zerolog.Ctx(ctx)
|
|
||||||
log.Debug().Msgf("Starting up...")
|
|
||||||
db, err := gorm.Open(postgres.Open(config.DBUrl), &gorm.Config{
|
|
||||||
Logger: gormzerolog.New(&logger.Config{
|
|
||||||
SlowThreshold: 1 * time.Second,
|
|
||||||
IgnoreRecordNotFoundError: true,
|
|
||||||
}, nil),
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("connecting to the database: %w", err)
|
|
||||||
}
|
|
||||||
log.Debug().Msgf("DB connection established")
|
|
||||||
|
|
||||||
lister, err := NewLister(ctx, db)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("failed to create lister: %w", err)
|
|
||||||
}
|
|
||||||
if err := lister.Start(ctx); err != nil {
|
|
||||||
return fmt.Errorf("failed to start lister: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
log.Info().Msgf("Starting HTTP listener on %q...", config.MetricsPort)
|
|
||||||
http.Handle("/metrics", promhttp.Handler())
|
|
||||||
srv := &http.Server{Addr: fmt.Sprintf(":%s", config.MetricsPort)}
|
|
||||||
errCh := make(chan error)
|
|
||||||
go func() {
|
|
||||||
errCh <- srv.ListenAndServe()
|
|
||||||
}()
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
if err := srv.Shutdown(context.Background()); err != nil {
|
|
||||||
return fmt.Errorf("HTTP server shutdown failed: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return <-errCh
|
|
||||||
}
|
|
||||||
|
|
||||||
func main() {
|
|
||||||
flag.StringVar(&config.LogFile, "log", "", "Path to the log file. If empty, will log to stderr")
|
|
||||||
flag.StringVar(&config.LogFormat, "log-format", "text", "Logging format. 'text' or 'json'")
|
|
||||||
flag.Int64Var(&config.LogLevel, "log-level", 1, "Log level. -1 - trace, 0 - debug, 1 - info, 5 - panic")
|
|
||||||
|
|
||||||
if err := envconfig.Process("lister", &config); err != nil {
|
|
||||||
log.Fatalf("envconfig.Process: %s", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
flag.Parse()
|
|
||||||
|
|
||||||
ctx, _ := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
|
||||||
if err := runMain(ctx); err != nil {
|
|
||||||
fmt.Fprintln(os.Stderr, err)
|
|
||||||
os.Exit(1)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func setupLogging(ctx context.Context) context.Context {
|
|
||||||
logFile := os.Stderr
|
|
||||||
|
|
||||||
if config.LogFile != "" {
|
|
||||||
f, err := os.OpenFile(config.LogFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
|
|
||||||
if err != nil {
|
|
||||||
log.Fatalf("Failed to open the specified log file %q: %s", config.LogFile, err)
|
|
||||||
}
|
|
||||||
logFile = f
|
|
||||||
}
|
|
||||||
|
|
||||||
var output io.Writer
|
|
||||||
|
|
||||||
switch config.LogFormat {
|
|
||||||
case "json":
|
|
||||||
output = logFile
|
|
||||||
case "text":
|
|
||||||
prefixList := []string{}
|
|
||||||
info, ok := debug.ReadBuildInfo()
|
|
||||||
if ok {
|
|
||||||
prefixList = append(prefixList, info.Path+"/")
|
|
||||||
}
|
|
||||||
|
|
||||||
basedir := ""
|
|
||||||
_, sourceFile, _, ok := runtime.Caller(0)
|
|
||||||
if ok {
|
|
||||||
basedir = filepath.Dir(sourceFile)
|
|
||||||
}
|
|
||||||
|
|
||||||
if basedir != "" && strings.HasPrefix(basedir, "/") {
|
|
||||||
prefixList = append(prefixList, basedir+"/")
|
|
||||||
head, _ := filepath.Split(basedir)
|
|
||||||
for head != "/" {
|
|
||||||
prefixList = append(prefixList, head)
|
|
||||||
head, _ = filepath.Split(strings.TrimSuffix(head, "/"))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
output = zerolog.ConsoleWriter{
|
|
||||||
Out: logFile,
|
|
||||||
NoColor: true,
|
|
||||||
TimeFormat: time.RFC3339,
|
|
||||||
PartsOrder: []string{
|
|
||||||
zerolog.LevelFieldName,
|
|
||||||
zerolog.TimestampFieldName,
|
|
||||||
zerolog.CallerFieldName,
|
|
||||||
zerolog.MessageFieldName,
|
|
||||||
},
|
|
||||||
FormatFieldName: func(i interface{}) string { return fmt.Sprintf("%s:", i) },
|
|
||||||
FormatFieldValue: func(i interface{}) string { return fmt.Sprintf("%s", i) },
|
|
||||||
FormatCaller: func(i interface{}) string {
|
|
||||||
s := i.(string)
|
|
||||||
for _, p := range prefixList {
|
|
||||||
s = strings.TrimPrefix(s, p)
|
|
||||||
}
|
|
||||||
return s
|
|
||||||
},
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
log.Fatalf("Invalid log format specified: %q", config.LogFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
logger := zerolog.New(output).Level(zerolog.Level(config.LogLevel)).With().Caller().Timestamp().Logger()
|
|
||||||
|
|
||||||
ctx = logger.WithContext(ctx)
|
|
||||||
|
|
||||||
zerolog.DefaultContextLogger = &logger
|
|
||||||
log.SetOutput(logger)
|
|
||||||
|
|
||||||
return ctx
|
|
||||||
}
|
|
|
@ -1,16 +0,0 @@
package main

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

var reposDiscovered = promauto.NewCounterVec(prometheus.CounterOpts{
	Name: "repo_discovered_counter",
	Help: "Counter of newly discovered repos",
}, []string{"remote"})

var reposListed = promauto.NewCounterVec(prometheus.CounterOpts{
	Name: "repo_listed_counter",
	Help: "Counter of repos received by listing PDSs.",
}, []string{"remote"})

@ -1,14 +0,0 @@
FROM golang:1.22.3 as builder
WORKDIR /app
COPY go.mod go.sum ./
RUN go mod download
COPY . ./
RUN go build -trimpath ./cmd/record-indexer

FROM alpine:latest as certs
RUN apk --update add ca-certificates

FROM debian:stable-slim
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
COPY --from=builder /app/record-indexer .
ENTRYPOINT ["./record-indexer"]

@ -1,7 +0,0 @@
*
**/*
!go.mod
!go.sum
!**/*.go
cmd/**
!cmd/record-indexer

@ -1,77 +0,0 @@
package main

import (
	"context"
	"fmt"
	"net/http"
	"strconv"

	"golang.org/x/time/rate"
)

func AddAdminHandlers(limiter *Limiter, pool *WorkerPool) {
	http.HandleFunc("/rate/set", handleRateSet(limiter))
	http.HandleFunc("/rate/setAll", handleRateSetAll(limiter))
	http.HandleFunc("/pool/resize", handlePoolResize(pool))
}

func handlePoolResize(pool *WorkerPool) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		s := r.FormValue("size")
		if s == "" {
			http.Error(w, "need size", http.StatusBadRequest)
			return
		}

		size, err := strconv.Atoi(s)
		if err != nil {
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}

		pool.Resize(context.Background(), size)
		fmt.Fprintln(w, "OK")
	}
}

func handleRateSet(limiter *Limiter) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		s := r.FormValue("limit")
		if s == "" {
			http.Error(w, "need limit", http.StatusBadRequest)
			return
		}
		name := r.FormValue("name")
		if name == "" {
			http.Error(w, "need name", http.StatusBadRequest)
			return
		}

		limit, err := strconv.Atoi(s)
		if err != nil {
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}

		limiter.SetLimit(context.Background(), name, rate.Limit(limit))
		fmt.Fprintln(w, "OK")
	}
}

func handleRateSetAll(limiter *Limiter) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		s := r.FormValue("limit")
		if s == "" {
			http.Error(w, "need limit", http.StatusBadRequest)
			return
		}
		limit, err := strconv.Atoi(s)
		if err != nil {
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}

		limiter.SetAllLimits(context.Background(), rate.Limit(limit))
		fmt.Fprintln(w, "OK")
	}
}

@ -1,179 +0,0 @@
|
||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"flag"
|
|
||||||
"fmt"
|
|
||||||
"io"
|
|
||||||
"log"
|
|
||||||
"net/http"
|
|
||||||
_ "net/http/pprof"
|
|
||||||
"os"
|
|
||||||
"os/signal"
|
|
||||||
"path/filepath"
|
|
||||||
"runtime"
|
|
||||||
"runtime/debug"
|
|
||||||
"strings"
|
|
||||||
"syscall"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
_ "github.com/joho/godotenv/autoload"
|
|
||||||
"github.com/kelseyhightower/envconfig"
|
|
||||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
|
||||||
"github.com/rs/zerolog"
|
|
||||||
"gorm.io/driver/postgres"
|
|
||||||
"gorm.io/gorm"
|
|
||||||
"gorm.io/gorm/logger"
|
|
||||||
|
|
||||||
"github.com/uabluerail/indexer/util/gormzerolog"
|
|
||||||
)
|
|
||||||
|
|
||||||
type Config struct {
|
|
||||||
LogFile string
|
|
||||||
LogFormat string `default:"text"`
|
|
||||||
LogLevel int64 `default:"1"`
|
|
||||||
MetricsPort string `split_words:"true"`
|
|
||||||
DBUrl string `envconfig:"POSTGRES_URL"`
|
|
||||||
Workers int `default:"2"`
|
|
||||||
}
|
|
||||||
|
|
||||||
var config Config
|
|
||||||
|
|
||||||
func runMain(ctx context.Context) error {
|
|
||||||
ctx = setupLogging(ctx)
|
|
||||||
log := zerolog.Ctx(ctx)
|
|
||||||
log.Debug().Msgf("Starting up...")
|
|
||||||
db, err := gorm.Open(postgres.Open(config.DBUrl), &gorm.Config{
|
|
||||||
Logger: gormzerolog.New(&logger.Config{
|
|
||||||
SlowThreshold: 3 * time.Second,
|
|
||||||
IgnoreRecordNotFoundError: true,
|
|
||||||
}, nil),
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("connecting to the database: %w", err)
|
|
||||||
}
|
|
||||||
log.Debug().Msgf("DB connection established")
|
|
||||||
|
|
||||||
limiter, err := NewLimiter(db)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("failed to create limiter: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
ch := make(chan WorkItem)
|
|
||||||
pool := NewWorkerPool(ch, db, config.Workers, limiter)
|
|
||||||
if err := pool.Start(ctx); err != nil {
|
|
||||||
return fmt.Errorf("failed to start worker pool: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
scheduler := NewScheduler(ch, db)
|
|
||||||
if err := scheduler.Start(ctx); err != nil {
|
|
||||||
return fmt.Errorf("failed to start scheduler: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
log.Info().Msgf("Starting HTTP listener on %q...", config.MetricsPort)
|
|
||||||
AddAdminHandlers(limiter, pool)
|
|
||||||
http.Handle("/metrics", promhttp.Handler())
|
|
||||||
srv := &http.Server{Addr: fmt.Sprintf(":%s", config.MetricsPort)}
|
|
||||||
errCh := make(chan error)
|
|
||||||
go func() {
|
|
||||||
errCh <- srv.ListenAndServe()
|
|
||||||
}()
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
if err := srv.Shutdown(context.Background()); err != nil {
|
|
||||||
return fmt.Errorf("HTTP server shutdown failed: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return <-errCh
|
|
||||||
}
|
|
||||||
|
|
||||||
func main() {
|
|
||||||
flag.StringVar(&config.LogFile, "log", "", "Path to the log file. If empty, will log to stderr")
|
|
||||||
flag.StringVar(&config.LogFormat, "log-format", "text", "Logging format. 'text' or 'json'")
|
|
||||||
flag.Int64Var(&config.LogLevel, "log-level", 1, "Log level. -1 - trace, 0 - debug, 1 - info, 5 - panic")
|
|
||||||
flag.IntVar(&config.Workers, "workers", 2, "Number of workers to start with")
|
|
||||||
|
|
||||||
if err := envconfig.Process("indexer", &config); err != nil {
|
|
||||||
log.Fatalf("envconfig.Process: %s", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
flag.Parse()
|
|
||||||
|
|
||||||
ctx, _ := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
|
||||||
if err := runMain(ctx); err != nil {
|
|
||||||
fmt.Fprintln(os.Stderr, err)
|
|
||||||
os.Exit(1)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func setupLogging(ctx context.Context) context.Context {
|
|
||||||
logFile := os.Stderr
|
|
||||||
|
|
||||||
if config.LogFile != "" {
|
|
||||||
f, err := os.OpenFile(config.LogFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
|
|
||||||
if err != nil {
|
|
||||||
log.Fatalf("Failed to open the specified log file %q: %s", config.LogFile, err)
|
|
||||||
}
|
|
||||||
logFile = f
|
|
||||||
}
|
|
||||||
|
|
||||||
var output io.Writer
|
|
||||||
|
|
||||||
switch config.LogFormat {
|
|
||||||
case "json":
|
|
||||||
output = logFile
|
|
||||||
case "text":
|
|
||||||
prefixList := []string{}
|
|
||||||
info, ok := debug.ReadBuildInfo()
|
|
||||||
if ok {
|
|
||||||
prefixList = append(prefixList, info.Path+"/")
|
|
||||||
}
|
|
||||||
|
|
||||||
basedir := ""
|
|
||||||
_, sourceFile, _, ok := runtime.Caller(0)
|
|
||||||
if ok {
|
|
||||||
basedir = filepath.Dir(sourceFile)
|
|
||||||
}
|
|
||||||
|
|
||||||
if basedir != "" && strings.HasPrefix(basedir, "/") {
|
|
||||||
prefixList = append(prefixList, basedir+"/")
|
|
||||||
head, _ := filepath.Split(basedir)
|
|
||||||
for head != "/" {
|
|
||||||
prefixList = append(prefixList, head)
|
|
||||||
head, _ = filepath.Split(strings.TrimSuffix(head, "/"))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
output = zerolog.ConsoleWriter{
|
|
||||||
Out: logFile,
|
|
||||||
NoColor: true,
|
|
||||||
TimeFormat: time.RFC3339,
|
|
||||||
PartsOrder: []string{
|
|
||||||
zerolog.LevelFieldName,
|
|
||||||
zerolog.TimestampFieldName,
|
|
||||||
zerolog.CallerFieldName,
|
|
||||||
zerolog.MessageFieldName,
|
|
||||||
},
|
|
||||||
FormatFieldName: func(i interface{}) string { return fmt.Sprintf("%s:", i) },
|
|
||||||
FormatFieldValue: func(i interface{}) string { return fmt.Sprintf("%s", i) },
|
|
||||||
FormatCaller: func(i interface{}) string {
|
|
||||||
s := i.(string)
|
|
||||||
for _, p := range prefixList {
|
|
||||||
s = strings.TrimPrefix(s, p)
|
|
||||||
}
|
|
||||||
return s
|
|
||||||
},
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
log.Fatalf("Invalid log format specified: %q", config.LogFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
logger := zerolog.New(output).Level(zerolog.Level(config.LogLevel)).With().Caller().Timestamp().Logger()
|
|
||||||
|
|
||||||
ctx = logger.WithContext(ctx)
|
|
||||||
|
|
||||||
zerolog.DefaultContextLogger = &logger
|
|
||||||
log.SetOutput(logger)
|
|
||||||
|
|
||||||
return ctx
|
|
||||||
}
|
|
|
@ -1,41 +0,0 @@
package main

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

var reposQueued = promauto.NewCounter(prometheus.CounterOpts{
	Name: "indexer_repos_queued_count",
	Help: "Number of repos added to the queue",
})

var queueLenght = promauto.NewGaugeVec(prometheus.GaugeOpts{
	Name: "indexer_queue_length",
	Help: "Current length of indexing queue",
}, []string{"state"})

var reposFetched = promauto.NewCounterVec(prometheus.CounterOpts{
	Name: "indexer_repos_fetched_count",
	Help: "Number of repos fetched",
}, []string{"remote", "success"})

var reposIndexed = promauto.NewCounterVec(prometheus.CounterOpts{
	Name: "indexer_repos_indexed_count",
	Help: "Number of repos indexed",
}, []string{"success"})

var recordsFetched = promauto.NewCounter(prometheus.CounterOpts{
	Name: "indexer_records_fetched_count",
	Help: "Number of records fetched",
})

var recordsInserted = promauto.NewCounter(prometheus.CounterOpts{
	Name: "indexer_records_inserted_count",
	Help: "Number of records inserted into DB",
})

var workerPoolSize = promauto.NewGauge(prometheus.GaugeOpts{
	Name: "indexer_workers_count",
	Help: "Current number of workers running",
})

@ -1,82 +0,0 @@
package main

import (
	"context"
	"fmt"
	"sync"

	"github.com/rs/zerolog"
	"github.com/uabluerail/indexer/pds"
	"golang.org/x/time/rate"
	"gorm.io/gorm"
)

const defaultRateLimit = 10

type Limiter struct {
	mu      sync.RWMutex
	db      *gorm.DB
	limiter map[string]*rate.Limiter
}

func NewLimiter(db *gorm.DB) (*Limiter, error) {
	remotes := []pds.PDS{}

	if err := db.Find(&remotes).Error; err != nil {
		return nil, fmt.Errorf("failed to get the list of known PDSs: %w", err)
	}

	l := &Limiter{
		db:      db,
		limiter: map[string]*rate.Limiter{},
	}

	for _, remote := range remotes {
		limit := remote.CrawlLimit
		if limit == 0 {
			limit = defaultRateLimit
		}
		l.limiter[remote.Host] = rate.NewLimiter(rate.Limit(limit), limit*2)
	}
	return l, nil
}

func (l *Limiter) getLimiter(name string) *rate.Limiter {
	l.mu.RLock()
	limiter := l.limiter[name]
	l.mu.RUnlock()

	if limiter != nil {
		return limiter
	}

	limiter = rate.NewLimiter(defaultRateLimit, defaultRateLimit*2)
	l.mu.Lock()
	l.limiter[name] = limiter
	l.mu.Unlock()
	return limiter
}

func (l *Limiter) Wait(ctx context.Context, name string) error {
	return l.getLimiter(name).Wait(ctx)
}

func (l *Limiter) SetLimit(ctx context.Context, name string, limit rate.Limit) {
	l.getLimiter(name).SetLimit(limit)
	err := l.db.Model(&pds.PDS{}).Where(&pds.PDS{Host: name}).Updates(&pds.PDS{CrawlLimit: int(limit)}).Error
	if err != nil {
		zerolog.Ctx(ctx).Error().Err(err).Msgf("Failed to persist rate limit change for %q: %s", name, err)
	}
}

func (l *Limiter) SetAllLimits(ctx context.Context, limit rate.Limit) {
	l.mu.RLock()
	for name, limiter := range l.limiter {
		limiter.SetLimit(limit)
		err := l.db.Model(&pds.PDS{}).Where(&pds.PDS{Host: name}).Updates(&pds.PDS{CrawlLimit: int(limit)}).Error
		if err != nil {
			zerolog.Ctx(ctx).Error().Err(err).Msgf("Failed to persist rate limit change for %q: %s", name, err)
		}
	}
	l.mu.RUnlock()
}

@ -1,158 +0,0 @@
package main

import (
	"context"
	"fmt"
	"slices"
	"time"

	"github.com/rs/zerolog"
	"github.com/uabluerail/indexer/pds"
	"github.com/uabluerail/indexer/repo"
	"gorm.io/gorm"
)

type Scheduler struct {
	db     *gorm.DB
	output chan<- WorkItem

	queue      map[string]*repo.Repo
	inProgress map[string]*repo.Repo
}

func NewScheduler(output chan<- WorkItem, db *gorm.DB) *Scheduler {
	return &Scheduler{
		db:         db,
		output:     output,
		queue:      map[string]*repo.Repo{},
		inProgress: map[string]*repo.Repo{},
	}
}

func (s *Scheduler) Start(ctx context.Context) error {
	go s.run(ctx)
	return nil
}

func (s *Scheduler) run(ctx context.Context) {
	log := zerolog.Ctx(ctx)
	t := time.NewTicker(time.Minute)
	defer t.Stop()

	if err := s.fillQueue(ctx); err != nil {
		log.Error().Err(err).Msgf("Failed to get more tasks for the queue: %s", err)
	}

	done := make(chan string)
	for {
		if len(s.queue) > 0 {
			next := WorkItem{signal: make(chan struct{})}
			for _, r := range s.queue {
				next.Repo = r
				break
			}

			select {
			case <-ctx.Done():
				return
			case <-t.C:
				if err := s.fillQueue(ctx); err != nil {
					log.Error().Err(err).Msgf("Failed to get more tasks for the queue: %s", err)
				}
			case s.output <- next:
				delete(s.queue, next.Repo.DID)
				s.inProgress[next.Repo.DID] = next.Repo
				go func(did string, ch chan struct{}) {
					select {
					case <-ch:
					case <-ctx.Done():
					}
					done <- did
				}(next.Repo.DID, next.signal)
				s.updateQueueLenMetrics()
			case did := <-done:
				delete(s.inProgress, did)
				s.updateQueueLenMetrics()
			}
		} else {
			select {
			case <-ctx.Done():
				return
			case <-t.C:
				if err := s.fillQueue(ctx); err != nil {
					log.Error().Err(err).Msgf("Failed to get more tasks for the queue: %s", err)
				}
			case did := <-done:
				delete(s.inProgress, did)
				s.updateQueueLenMetrics()
			}
		}
	}
}

func (s *Scheduler) fillQueue(ctx context.Context) error {
	const maxQueueLen = 10000
	const maxAttempts = 3

	if len(s.queue)+len(s.inProgress) >= maxQueueLen {
		return nil
	}

	remotes := []pds.PDS{}
	if err := s.db.Find(&remotes).Error; err != nil {
		return fmt.Errorf("failed to get the list of PDSs: %w", err)
	}

	remotes = slices.DeleteFunc(remotes, func(pds pds.PDS) bool {
		return pds.Disabled
	})
	perPDSLimit := maxQueueLen
	if len(remotes) > 0 {
		perPDSLimit = maxQueueLen * 2 / len(remotes)
	}
	if perPDSLimit < maxQueueLen/10 {
		perPDSLimit = maxQueueLen / 10
	}

	// Fake remote to account for repos we didn't have a PDS for yet.
	remotes = append(remotes, pds.PDS{ID: pds.Unknown})

	for _, remote := range remotes {
		repos := []repo.Repo{}

		err := s.db.Raw(`SELECT * FROM "repos" WHERE pds = ? AND (last_indexed_rev is null OR last_indexed_rev = '') AND failed_attempts < ?
			UNION
			SELECT "repos".* FROM "repos" left join "pds" on repos.pds = pds.id WHERE pds = ?
			AND
			(
				(first_rev_since_reset is not null AND first_rev_since_reset <> ''
					AND last_indexed_rev < first_rev_since_reset)
				OR
				("repos".first_cursor_since_reset is not null AND "repos".first_cursor_since_reset <> 0
					AND "repos".first_cursor_since_reset < "pds".first_cursor_since_reset)
			)
			AND failed_attempts < ? LIMIT ?`,
			remote.ID, maxAttempts, remote.ID, maxAttempts, perPDSLimit).
			Scan(&repos).Error

		if err != nil {
			return fmt.Errorf("querying DB: %w", err)
		}
		for _, r := range repos {
			if s.queue[r.DID] != nil || s.inProgress[r.DID] != nil {
				continue
			}
			copied := r
			s.queue[r.DID] = &copied
			reposQueued.Inc()
		}
		s.updateQueueLenMetrics()
	}

	return nil
}

func (s *Scheduler) updateQueueLenMetrics() {
	queueLenght.WithLabelValues("queued").Set(float64(len(s.queue)))
	queueLenght.WithLabelValues("inProgress").Set(float64(len(s.inProgress)))
}

@ -1,315 +0,0 @@
|
||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"bytes"
|
|
||||||
"context"
|
|
||||||
"fmt"
|
|
||||||
"regexp"
|
|
||||||
"strings"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/imax9000/errors"
|
|
||||||
"github.com/rs/zerolog"
|
|
||||||
"gorm.io/gorm"
|
|
||||||
"gorm.io/gorm/clause"
|
|
||||||
|
|
||||||
comatproto "github.com/bluesky-social/indigo/api/atproto"
|
|
||||||
"github.com/bluesky-social/indigo/util"
|
|
||||||
"github.com/bluesky-social/indigo/xrpc"
|
|
||||||
|
|
||||||
"github.com/uabluerail/bsky-tools/xrpcauth"
|
|
||||||
"github.com/uabluerail/indexer/models"
|
|
||||||
"github.com/uabluerail/indexer/pds"
|
|
||||||
"github.com/uabluerail/indexer/repo"
|
|
||||||
"github.com/uabluerail/indexer/util/fix"
|
|
||||||
"github.com/uabluerail/indexer/util/resolver"
|
|
||||||
)
|
|
||||||
|
|
||||||
type WorkItem struct {
|
|
||||||
Repo *repo.Repo
|
|
||||||
signal chan struct{}
|
|
||||||
}
|
|
||||||
|
|
||||||
type WorkerPool struct {
|
|
||||||
db *gorm.DB
|
|
||||||
input <-chan WorkItem
|
|
||||||
limiter *Limiter
|
|
||||||
|
|
||||||
workerSignals []chan struct{}
|
|
||||||
resize chan int
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewWorkerPool(input <-chan WorkItem, db *gorm.DB, size int, limiter *Limiter) *WorkerPool {
|
|
||||||
r := &WorkerPool{
|
|
||||||
db: db,
|
|
||||||
input: input,
|
|
||||||
limiter: limiter,
|
|
||||||
resize: make(chan int),
|
|
||||||
}
|
|
||||||
r.workerSignals = make([]chan struct{}, size)
|
|
||||||
for i := range r.workerSignals {
|
|
||||||
r.workerSignals[i] = make(chan struct{})
|
|
||||||
}
|
|
||||||
return r
|
|
||||||
}
|
|
||||||
|
|
||||||
func (p *WorkerPool) Start(ctx context.Context) error {
|
|
||||||
go p.run(ctx)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (p *WorkerPool) Resize(ctx context.Context, size int) error {
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
return ctx.Err()
|
|
||||||
case p.resize <- size:
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (p *WorkerPool) run(ctx context.Context) {
|
|
||||||
for _, ch := range p.workerSignals {
|
|
||||||
go p.worker(ctx, ch)
|
|
||||||
}
|
|
||||||
workerPoolSize.Set(float64(len(p.workerSignals)))
|
|
||||||
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
for _, ch := range p.workerSignals {
|
|
||||||
close(ch)
|
|
||||||
}
|
|
||||||
// also wait for all workers to stop?
|
|
||||||
return
|
|
||||||
case newSize := <-p.resize:
|
|
||||||
switch {
|
|
||||||
case newSize > len(p.workerSignals):
|
|
||||||
ch := make([]chan struct{}, newSize-len(p.workerSignals))
|
|
||||||
for i := range ch {
|
|
||||||
ch[i] = make(chan struct{})
|
|
||||||
go p.worker(ctx, ch[i])
|
|
||||||
}
|
|
||||||
p.workerSignals = append(p.workerSignals, ch...)
|
|
||||||
workerPoolSize.Set(float64(len(p.workerSignals)))
|
|
||||||
case newSize < len(p.workerSignals) && newSize > 0:
|
|
||||||
for _, ch := range p.workerSignals[newSize:] {
|
|
||||||
close(ch)
|
|
||||||
}
|
|
||||||
p.workerSignals = p.workerSignals[:newSize]
|
|
||||||
workerPoolSize.Set(float64(len(p.workerSignals)))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (p *WorkerPool) worker(ctx context.Context, signal chan struct{}) {
|
|
||||||
log := zerolog.Ctx(ctx)
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
return
|
|
||||||
case <-signal:
|
|
||||||
return
|
|
||||||
case work := <-p.input:
|
|
||||||
updates := &repo.Repo{}
|
|
||||||
if err := p.doWork(ctx, work); err != nil {
|
|
||||||
log.Error().Err(err).Msgf("Work task %q failed: %s", work.Repo.DID, err)
|
|
||||||
updates.LastError = err.Error()
|
|
||||||
updates.FailedAttempts = work.Repo.FailedAttempts + 1
|
|
||||||
reposIndexed.WithLabelValues("false").Inc()
|
|
||||||
} else {
|
|
||||||
updates.FailedAttempts = 0
|
|
||||||
reposIndexed.WithLabelValues("true").Inc()
|
|
||||||
}
|
|
||||||
updates.LastIndexAttempt = time.Now()
|
|
||||||
err := p.db.Model(&repo.Repo{}).
|
|
||||||
Where(&repo.Repo{ID: work.Repo.ID}).
|
|
||||||
Select("last_error", "last_index_attempt", "failed_attempts").
|
|
||||||
Updates(updates).Error
|
|
||||||
if err != nil {
|
|
||||||
log.Error().Err(err).Msgf("Failed to update repo info for %q: %s", work.Repo.DID, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (p *WorkerPool) doWork(ctx context.Context, work WorkItem) error {
|
|
||||||
log := zerolog.Ctx(ctx)
|
|
||||||
defer close(work.signal)
|
|
||||||
|
|
||||||
u, pubKey, err := resolver.GetPDSEndpointAndPublicKey(ctx, work.Repo.DID)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
remote, err := pds.EnsureExists(ctx, p.db, u.String())
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("failed to get PDS records for %q: %w", u, err)
|
|
||||||
}
|
|
||||||
if work.Repo.PDS != remote.ID {
|
|
||||||
if err := p.db.Model(&work.Repo).Where(&repo.Repo{ID: work.Repo.ID}).Updates(&repo.Repo{PDS: remote.ID}).Error; err != nil {
|
|
||||||
return fmt.Errorf("failed to update repo's PDS to %q: %w", u, err)
|
|
||||||
}
|
|
||||||
work.Repo.PDS = remote.ID
|
|
||||||
}
|
|
||||||
|
|
||||||
client := xrpcauth.NewAnonymousClient(ctx)
|
|
||||||
client.Host = u.String()
|
|
||||||
client.Client = util.RobustHTTPClient()
|
|
||||||
client.Client.Timeout = 30 * time.Minute
|
|
||||||
|
|
||||||
knownCursorBeforeFetch := remote.FirstCursorSinceReset
|
|
||||||
|
|
||||||
retry:
|
|
||||||
if p.limiter != nil {
|
|
||||||
if err := p.limiter.Wait(ctx, u.String()); err != nil {
|
|
||||||
return fmt.Errorf("failed to wait on rate limiter: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: add a configuration knob for switching between full and partial fetch.
|
|
||||||
sinceRev := work.Repo.LastIndexedRev
|
|
||||||
b, err := comatproto.SyncGetRepo(ctx, client, work.Repo.DID, sinceRev)
|
|
||||||
if err != nil {
|
|
||||||
if err, ok := errors.As[*xrpc.Error](err); ok {
|
|
||||||
if err.IsThrottled() && err.Ratelimit != nil {
|
|
||||||
log.Debug().Str("pds", u.String()).Msgf("Hit a rate limit (%s), sleeping until %s", err.Ratelimit.Policy, err.Ratelimit.Reset)
|
|
||||||
time.Sleep(time.Until(err.Ratelimit.Reset))
|
|
||||||
goto retry
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
reposFetched.WithLabelValues(u.String(), "false").Inc()
|
|
||||||
return fmt.Errorf("failed to fetch repo: %w", err)
|
|
||||||
}
|
|
||||||
if len(b) == 0 {
|
|
||||||
reposFetched.WithLabelValues(u.String(), "false").Inc()
|
|
||||||
return fmt.Errorf("PDS returned zero bytes")
|
|
||||||
}
|
|
||||||
reposFetched.WithLabelValues(u.String(), "true").Inc()
|
|
||||||
|
|
||||||
if work.Repo.PDS == pds.Unknown {
|
|
||||||
remote, err := pds.EnsureExists(ctx, p.db, u.String())
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
work.Repo.PDS = remote.ID
|
|
||||||
if err := p.db.Model(&work.Repo).Where(&repo.Repo{ID: work.Repo.ID}).Updates(&repo.Repo{PDS: work.Repo.PDS}).Error; err != nil {
|
|
||||||
return fmt.Errorf("failed to set repo's PDS: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
newRev, err := repo.GetRev(ctx, bytes.NewReader(b))
|
|
||||||
if sinceRev != "" && errors.Is(err, repo.ErrZeroBlocks) {
|
|
||||||
// No new records since the rev we requested above.
|
|
||||||
if work.Repo.FirstCursorSinceReset < knownCursorBeforeFetch {
|
|
||||||
if err := p.bumpFirstCursorSinceReset(work.Repo.ID, knownCursorBeforeFetch); err != nil {
|
|
||||||
return fmt.Errorf("updating first_cursor_since_reset: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
} else if err != nil {
|
|
||||||
l := 25
|
|
||||||
if len(b) < l {
|
|
||||||
l = len(b)
|
|
||||||
}
|
|
||||||
log.Debug().Err(err).Msgf("Total bytes fetched: %d. First few bytes: %q", len(b), string(b[:l]))
|
|
||||||
return fmt.Errorf("failed to read 'rev' from the fetched repo: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
newRecs, err := repo.ExtractRecords(ctx, bytes.NewReader(b), pubKey)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("failed to extract records: %w", err)
|
|
||||||
}
|
|
||||||
recs := []repo.Record{}
|
|
||||||
for k, v := range newRecs {
|
|
||||||
parts := strings.SplitN(k, "/", 2)
|
|
||||||
if len(parts) != 2 {
|
|
||||||
log.Warn().Msgf("Unexpected key format: %q", k)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
v = regexp.MustCompile(`[^\\](\\\\)*(\\u0000)`).ReplaceAll(v, []byte(`$1<0x00>`))
|
|
||||||
|
|
||||||
recs = append(recs, repo.Record{
|
|
||||||
Repo: models.ID(work.Repo.ID),
|
|
||||||
Collection: parts[0],
|
|
||||||
Rkey: parts[1],
|
|
||||||
// XXX: proper replacement of \u0000 would require full parsing of JSON
|
|
||||||
// and recursive iteration over all string values, but this
|
|
||||||
// should work well enough for now.
|
|
||||||
Content: fix.EscapeNullCharForPostgres(v),
|
|
||||||
AtRev: newRev,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
recordsFetched.Add(float64(len(recs)))
|
|
||||||
if len(recs) > 0 {
|
|
||||||
for _, batch := range splitInBatshes(recs, 500) {
|
|
||||||
result := p.db.Model(&repo.Record{}).
|
|
||||||
Clauses(clause.OnConflict{
|
|
||||||
Where: clause.Where{Exprs: []clause.Expression{
|
|
||||||
clause.Neq{
|
|
||||||
Column: clause.Column{Name: "content", Table: "records"},
|
|
||||||
Value: clause.Column{Name: "content", Table: "excluded"}},
|
|
||||||
clause.Or(
|
|
||||||
clause.Eq{Column: clause.Column{Name: "at_rev", Table: "records"}, Value: nil},
|
|
||||||
clause.Eq{Column: clause.Column{Name: "at_rev", Table: "records"}, Value: ""},
|
|
||||||
clause.Lt{
|
|
||||||
Column: clause.Column{Name: "at_rev", Table: "records"},
|
|
||||||
Value: clause.Column{Name: "at_rev", Table: "excluded"}},
|
|
||||||
)}},
|
|
||||||
DoUpdates: clause.AssignmentColumns([]string{"content", "at_rev"}),
|
|
||||||
Columns: []clause.Column{{Name: "repo"}, {Name: "collection"}, {Name: "rkey"}}}).
|
|
||||||
Create(batch)
|
|
||||||
if err := result.Error; err != nil {
|
|
||||||
return fmt.Errorf("inserting records into the database: %w", err)
|
|
||||||
}
|
|
||||||
recordsInserted.Add(float64(result.RowsAffected))
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
err = p.db.Model(&repo.Repo{}).Where(&repo.Repo{ID: work.Repo.ID}).
|
|
||||||
Updates(&repo.Repo{LastIndexedRev: newRev}).Error
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("updating repo rev: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if work.Repo.FirstCursorSinceReset < knownCursorBeforeFetch {
|
|
||||||
if err := p.bumpFirstCursorSinceReset(work.Repo.ID, knownCursorBeforeFetch); err != nil {
|
|
||||||
return fmt.Errorf("updating first_cursor_since_reset: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// TODO: check for records that are missing in the repo download
|
|
||||||
// and mark them as deleted.
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// bumpFirstCursorSinceReset increases repo's FirstCursorSinceReset iff it is currently lower than the supplied value.
|
|
||||||
func (p *WorkerPool) bumpFirstCursorSinceReset(repoId models.ID, cursorValue int64) error {
|
|
||||||
return p.db.Transaction(func(tx *gorm.DB) error {
|
|
||||||
var currentCursor int64
|
|
||||||
err := tx.Model(&repo.Repo{}).Where(&repo.Repo{ID: repoId}).
|
|
||||||
Select("first_cursor_since_reset").First(¤tCursor).Error
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("failed to get current cursor value: %w", err)
|
|
||||||
}
|
|
||||||
if currentCursor < cursorValue {
|
|
||||||
return tx.Model(&repo.Repo{}).Where(&repo.Repo{ID: repoId}).
|
|
||||||
Updates(&repo.Repo{FirstCursorSinceReset: cursorValue}).Error
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func splitInBatshes[T any](s []T, batchSize int) [][]T {
|
|
||||||
var r [][]T
|
|
||||||
for i := 0; i < len(s); i += batchSize {
|
|
||||||
if i+batchSize < len(s) {
|
|
||||||
r = append(r, s[i:i+batchSize])
|
|
||||||
} else {
|
|
||||||
r = append(r, s[i:])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return r
|
|
||||||
}
|
|
|
@ -1,14 +0,0 @@
FROM golang:1.22.3 as builder
WORKDIR /app
COPY go.mod go.sum ./
RUN go mod download
COPY . ./
RUN go build -trimpath ./cmd/update-db-schema

FROM alpine:latest as certs
RUN apk --update add ca-certificates

FROM debian:stable-slim
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
COPY --from=builder /app/update-db-schema .
ENTRYPOINT ["./update-db-schema"]

@ -1,7 +0,0 @@
*
**/*
!go.mod
!go.sum
!**/*.go
cmd/**
!cmd/update-db-schema

@ -1,155 +0,0 @@
|
||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"flag"
|
|
||||||
"fmt"
|
|
||||||
"io"
|
|
||||||
"log"
|
|
||||||
_ "net/http/pprof"
|
|
||||||
"os"
|
|
||||||
"os/signal"
|
|
||||||
"path/filepath"
|
|
||||||
"runtime"
|
|
||||||
"runtime/debug"
|
|
||||||
"strings"
|
|
||||||
"syscall"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
_ "github.com/joho/godotenv/autoload"
|
|
||||||
"github.com/kelseyhightower/envconfig"
|
|
||||||
"github.com/rs/zerolog"
|
|
||||||
"gorm.io/driver/postgres"
|
|
||||||
"gorm.io/gorm"
|
|
||||||
"gorm.io/gorm/logger"
|
|
||||||
|
|
||||||
"github.com/uabluerail/indexer/pds"
|
|
||||||
"github.com/uabluerail/indexer/repo"
|
|
||||||
"github.com/uabluerail/indexer/util/gormzerolog"
|
|
||||||
)
|
|
||||||
|
|
||||||
type Config struct {
|
|
||||||
LogFile string
|
|
||||||
LogFormat string `default:"text"`
|
|
||||||
LogLevel int64 `default:"1"`
|
|
||||||
DBUrl string `envconfig:"POSTGRES_URL"`
|
|
||||||
}
|
|
||||||
|
|
||||||
var config Config
|
|
||||||
|
|
||||||
func runMain(ctx context.Context) error {
|
|
||||||
ctx = setupLogging(ctx)
|
|
||||||
log := zerolog.Ctx(ctx)
|
|
||||||
log.Debug().Msgf("Starting up...")
|
|
||||||
db, err := gorm.Open(postgres.Open(config.DBUrl), &gorm.Config{
|
|
||||||
Logger: gormzerolog.New(&logger.Config{
|
|
||||||
SlowThreshold: 1 * time.Second,
|
|
||||||
IgnoreRecordNotFoundError: true,
|
|
||||||
}, nil),
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("connecting to the database: %w", err)
|
|
||||||
}
|
|
||||||
log.Debug().Msgf("DB connection established")
|
|
||||||
|
|
||||||
for _, f := range []func(*gorm.DB) error{
|
|
||||||
pds.AutoMigrate,
|
|
||||||
repo.AutoMigrate,
|
|
||||||
} {
|
|
||||||
if err := f(db); err != nil {
|
|
||||||
return fmt.Errorf("auto-migrating DB schema: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
log.Debug().Msgf("DB schema updated")
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func main() {
|
|
||||||
flag.StringVar(&config.LogFile, "log", "", "Path to the log file. If empty, will log to stderr")
|
|
||||||
flag.StringVar(&config.LogFormat, "log-format", "text", "Logging format. 'text' or 'json'")
|
|
||||||
flag.Int64Var(&config.LogLevel, "log-level", 1, "Log level. -1 - trace, 0 - debug, 1 - info, 5 - panic")
|
|
||||||
|
|
||||||
if err := envconfig.Process("update-db-schema", &config); err != nil {
|
|
||||||
log.Fatalf("envconfig.Process: %s", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
flag.Parse()
|
|
||||||
|
|
||||||
ctx, _ := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
|
||||||
if err := runMain(ctx); err != nil {
|
|
||||||
fmt.Fprintln(os.Stderr, err)
|
|
||||||
os.Exit(1)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func setupLogging(ctx context.Context) context.Context {
|
|
||||||
logFile := os.Stderr
|
|
||||||
|
|
||||||
if config.LogFile != "" {
|
|
||||||
f, err := os.OpenFile(config.LogFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
|
|
||||||
if err != nil {
|
|
||||||
log.Fatalf("Failed to open the specified log file %q: %s", config.LogFile, err)
|
|
||||||
}
|
|
||||||
logFile = f
|
|
||||||
}
|
|
||||||
|
|
||||||
var output io.Writer
|
|
||||||
|
|
||||||
switch config.LogFormat {
|
|
||||||
case "json":
|
|
||||||
output = logFile
|
|
||||||
case "text":
|
|
||||||
prefixList := []string{}
|
|
||||||
info, ok := debug.ReadBuildInfo()
|
|
||||||
if ok {
|
|
||||||
prefixList = append(prefixList, info.Path+"/")
|
|
||||||
}
|
|
||||||
|
|
||||||
basedir := ""
|
|
||||||
_, sourceFile, _, ok := runtime.Caller(0)
|
|
||||||
if ok {
|
|
||||||
basedir = filepath.Dir(sourceFile)
|
|
||||||
}
|
|
||||||
|
|
||||||
if basedir != "" && strings.HasPrefix(basedir, "/") {
|
|
||||||
prefixList = append(prefixList, basedir+"/")
|
|
||||||
head, _ := filepath.Split(basedir)
|
|
||||||
for head != "/" {
|
|
||||||
prefixList = append(prefixList, head)
|
|
||||||
head, _ = filepath.Split(strings.TrimSuffix(head, "/"))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
output = zerolog.ConsoleWriter{
|
|
||||||
Out: logFile,
|
|
||||||
NoColor: true,
|
|
||||||
TimeFormat: time.RFC3339,
|
|
||||||
PartsOrder: []string{
|
|
||||||
zerolog.LevelFieldName,
|
|
||||||
zerolog.TimestampFieldName,
|
|
||||||
zerolog.CallerFieldName,
|
|
||||||
zerolog.MessageFieldName,
|
|
||||||
},
|
|
||||||
FormatFieldName: func(i interface{}) string { return fmt.Sprintf("%s:", i) },
|
|
||||||
FormatFieldValue: func(i interface{}) string { return fmt.Sprintf("%s", i) },
|
|
||||||
FormatCaller: func(i interface{}) string {
|
|
||||||
s := i.(string)
|
|
||||||
for _, p := range prefixList {
|
|
||||||
s = strings.TrimPrefix(s, p)
|
|
||||||
}
|
|
||||||
return s
|
|
||||||
},
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
log.Fatalf("Invalid log format specified: %q", config.LogFormat)
|
|
||||||
}
|
|
||||||
|
|
||||||
logger := zerolog.New(output).Level(zerolog.Level(config.LogLevel)).With().Caller().Timestamp().Logger()
|
|
||||||
|
|
||||||
ctx = logger.WithContext(ctx)
|
|
||||||
|
|
||||||
zerolog.DefaultContextLogger = &logger
|
|
||||||
log.SetOutput(logger)
|
|
||||||
|
|
||||||
return ctx
|
|
||||||
}
|
|
|
@ -1,320 +0,0 @@
# Data consistency model

## Indicators received from upstream

We have two interconnected strictly ordered values: `rev` and cursor. `rev` is
local to each repo, cursor provides additional ordering across all repos hosted
on a PDS.

### `rev`

String value, sequencing each commit within a given repo. Each next commit must
have a `rev` value strictly greater than the previous commit.

### Cursor

Integer number, sent with each message in firehose. Must be strictly increasing.
Messages also contain `rev` value for the corresponding repo event, and we
assume that within each repo all commits with smaller `rev` values also were
sent with smaller cursor values. That is, cursor sequences all events recorded
by the PDS and we assume that events of each given repo are sent in proper
order.

#### Cursor reset

"Cursor reset" is a situation where upon reconnecting to a PDS we find out that
the PDS is unable to send us all events that happened since the cursor value we
have recorded. It is **Very Bad**™, because we have no idea what events we
missed between our recorded cursor and the new cursor that the PDS has sent us.

This gap in data from a PDS must be addressed somehow, and most of this document
revolves around detecting when a given repo is affected by a cursor reset and
how to recover missing data with minimal effort.

## Available operations

### Repo fetch

We can fetch a full copy of a repo. Each commit contains a `rev` - a string value
that is strictly increasing with each new commit.

We also have the option to only fetch records created after a particular `rev` -
this is useful for reducing the amount of data received when we already have
some of the records.

### Consuming firehose

We can stream new events from each PDS. Every event comes with a cursor value -
an integer number that is strictly increasing, scoped to a PDS. Events also contain
the repo-specific `rev`, which is the same as in a full repo fetch.

## High-level overview

With `rev` imposing strict ordering on repo operations, we maintain the
following two indicators for each repo:

1. `LastCompleteRev` - largest `rev` value that we are sure we have the complete
   set of records at. For example, we can set this after processing the output
   of a `getRepo` call.
2. `FirstUninterruptedFirehoseRev` - smallest `rev` value from which we are sure
   to have a complete set of records up until ~now.

These indicators define two intervals of `rev` values (`(-Infinity,
LastCompleteRev]`, `[FirstUninterruptedFirehoseRev, now)`) that we assume to
have already processed. If these intervals overlap - we assume that we've
covered `(-Infinity, now)`, i.e., have a complete set of records of a given
repo. If they don't overlap - we might have missed some records, and can
remediate that by fetching the whole repo, indexing records we don't have and
updating `LastCompleteRev`.

Both of these indicators should never decrease. When a PDS tells us that our
cursor value is invalid, we move `FirstUninterruptedFirehoseRev` forward, which
in turn can make the above intervals non-overlapping.

These indicators also can be uninitialized, which means that we have no data
about the corresponding interval.

Note that for performance and feasibility reasons we don't store these two
indicators in the database directly. Instead, to minimize the number of writes,
we derive them from a few other values.
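
The interval logic above boils down to a small predicate. Below is a minimal
sketch of it, not the actual schema: the struct and field names are
illustrative, uninitialized indicators are modelled as empty strings, and `rev`
values are compared as plain strings, matching the strict ordering assumed
above.

```go
package consistency

// RepoIndicators holds the two derived indicators discussed above.
// Empty string means "uninitialized".
type RepoIndicators struct {
	LastCompleteRev               string // everything up to and including this rev is indexed
	FirstUninterruptedFirehoseRev string // everything from this rev up to ~now is indexed
}

// IsComplete reports whether the two covered intervals overlap, i.e. whether
// we believe we have every record of the repo.
func IsComplete(r RepoIndicators) bool {
	if r.LastCompleteRev == "" || r.FirstUninterruptedFirehoseRev == "" {
		return false // at least one interval is unknown
	}
	return r.LastCompleteRev >= r.FirstUninterruptedFirehoseRev
}
```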

### Updating `LastCompleteRev`

We can move `LastCompleteRev` forward when either:

* We just indexed a full repo checkout
* We got a new record from firehose AND the repo currently has no gaps
  (`LastCompleteRev` >= `FirstUninterruptedFirehoseRev`)

### Updating `FirstUninterruptedFirehoseRev`

Once initialized, stays constant during normal operation. Can move forward if a
PDS informs us that we missed some records and it can't replay all of them (and
resets our cursor).

## Handling cursor resets

### Naive approach

We could store `FirstUninterruptedFirehoseRev` in a column for each repo, and
when we detect a cursor reset - unset it for every repo from a particular PDS.

There are a couple of issues with this:

1. Cursor reset will trigger a lot of writes: a row for each repo from the
   affected PDS will have to be updated.
2. We have no information about the `[FirstUninterruptedFirehoseRev, now)` interval
   until we see a new commit for a repo, which might take a long time, or never
   happen at all.

### Reducing the number of writes

We can rely on the firehose cursor value imposing additional ordering on
commits.

1. Start tracking firehose stream continuity by storing
   `FirstUninterruptedCursor` for each PDS
2. When receiving a commit from firehose, compare `FirstUninterruptedCursor`
   between repo and PDS entries:
   * If `Repo`.`FirstUninterruptedCursor` < `PDS`.`FirstUninterruptedCursor`,
     set `FirstUninterruptedFirehoseRev` to the commit's `rev` and copy
     `FirstUninterruptedCursor` from the PDS entry.

Now during a cursor reset we only need to change `FirstUninterruptedCursor` in
the PDS entry. And if `Repo`.`FirstUninterruptedCursor` <
`PDS`.`FirstUninterruptedCursor` - we know that the repo's hosting PDS reset our
cursor at some point and the `FirstUninterruptedFirehoseRev` value is no longer
valid.
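
A minimal sketch of this per-commit bookkeeping is below. The types and field
names are illustrative only (they are not the real gorm models), and
persistence is left out: the point is that the repo row is written only when
the PDS-level cursor has moved past the repo's copy.

```go
package consistency

type PDSState struct {
	FirstUninterruptedCursor int64
}

type RepoState struct {
	FirstUninterruptedCursor      int64
	FirstUninterruptedFirehoseRev string
}

// OnFirehoseCommit is called for every commit received from the firehose.
// It reports whether the repo entry needs to be written back.
func OnFirehoseCommit(repo *RepoState, pds PDSState, commitRev string) (updated bool) {
	if repo.FirstUninterruptedCursor < pds.FirstUninterruptedCursor {
		// The PDS cursor moved past ours: either this is the first event we
		// see for this repo, or a cursor reset happened. Restart the
		// "uninterrupted since" interval at this commit.
		repo.FirstUninterruptedFirehoseRev = commitRev
		repo.FirstUninterruptedCursor = pds.FirstUninterruptedCursor
		return true
	}
	return false
}
```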

### Avoiding long wait for the first firehose event

We can fetch the full repo to index any missing records and advance
`LastCompleteRev` accordingly. But if we don't update
`Repo`.`FirstUninterruptedCursor` - it will stay smaller than
`PDS`.`FirstUninterruptedCursor` and `FirstUninterruptedFirehoseRev` will remain
invalid.

We can fix that with an additional assumption: the PDS provides strong consistency
between the firehose and `getRepo` - if we have already seen cursor value `X`,
then the `getRepo` response will be up to date with all commits corresponding to
cursor values smaller than or equal to `X`.

1. Before fetching the repo, note the current `FirstUninterruptedCursor` value
   of the repo's hosting PDS. (Or even the latest `Cursor` value)
2. Fetch and process the full repo checkout, setting `LastCompleteRev`
3. If `Repo`.`FirstUninterruptedCursor` < `PDS`.`FirstUninterruptedCursor` still
   holds (i.e., no new records on firehose while we were re-indexing), then set
   `Repo`.`FirstUninterruptedCursor` to the cursor value recorded in step 1.
   With the above assumption, all records that happened between
   `FirstUninterruptedFirehoseRev` and this cursor value were already processed
   in step 2, so `FirstUninterruptedFirehoseRev` is again valid, until
   `PDS`.`FirstUninterruptedCursor` moves forward again.
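
The three steps above can be sketched roughly as follows. The `Store`
interface and all of its method names are hypothetical stand-ins for the DB
layer and the actual fetch/upsert logic, under the strong-consistency
assumption stated above; this is not the real implementation.

```go
package consistency

import "context"

type Store interface {
	PDSCursor(ctx context.Context, pdsID int64) (int64, error)       // PDS.FirstUninterruptedCursor
	RepoCursor(ctx context.Context, repoID int64) (int64, error)     // Repo.FirstUninterruptedCursor
	BumpRepoCursor(ctx context.Context, repoID int64, c int64) error // only ever moves the value forward
	IndexFullCheckout(ctx context.Context, repoID int64) error       // fetch + upsert + set LastCompleteRev
}

func Reindex(ctx context.Context, s Store, repoID, pdsID int64) error {
	// Step 1: note the PDS cursor before fetching.
	before, err := s.PDSCursor(ctx, pdsID)
	if err != nil {
		return err
	}
	// Step 2: fetch and process the full checkout.
	if err := s.IndexFullCheckout(ctx, repoID); err != nil {
		return err
	}
	// Step 3: the checkout covers everything up to `before`, so the repo's
	// cursor can be advanced to it if it is still behind.
	cur, err := s.RepoCursor(ctx, repoID)
	if err != nil {
		return err
	}
	if cur < before {
		return s.BumpRepoCursor(ctx, repoID, before)
	}
	return nil
}
```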
||||||
|
|
||||||
## Repo discovery
|
|
||||||
|
|
||||||
We have the ability to get a complete list of hosted repos from a PDS. The
|
|
||||||
response includes last known `rev` for each repo, but does not come attached
|
|
||||||
with a firehose cursor value. We're assuming here the same level of consistency
|
|
||||||
as with `getRepo`, and can initialize `Repo`.`FirstUninterruptedCursor` with the
|
|
||||||
value from the PDS entry recorded before making the call to list repos, and
|
|
||||||
`FirstUninterruptedFirehoseRev` to the returned `rev`.
|
|
||||||
|
|
||||||
TODO: consider if it's worth to not touch cursor/`rev` values here and offload
|
|
||||||
initializing them to indexing step described above.
|
|
||||||
|
|
||||||
## Updating `LastCompleteRev` based on firehose events
|
|
||||||
|
|
||||||
We have the option to only advance `LastCompleteRev` when processing the full
|
|
||||||
repo checkout. While completely valid, it's rather pessimistic in that, in
|
|
||||||
absence of cursor resets, this value will remain arbitrarily old despite us
|
|
||||||
actually having a complete set of records for the repo. Consequently, when a
|
|
||||||
cursor reset eventually does happen - we'll be assuming that we're missing much
|
|
||||||
more records than we actually do.
|
|
||||||
|
|
||||||
Naively, we can simply update `LastCompleteRev` on every event (iff the
|
|
||||||
completeness intervals are currently overlapping). The drawback is that each
|
|
||||||
event, in addition to new record creation, will update the corresponding repo
|
|
||||||
entry. If we could avoid this, it would considerably reduce the number of
|
|
||||||
writes.
|
|
||||||
|
|
||||||
### Alternative 1: delay updates
|
|
||||||
|
|
||||||
We can delay updating `LastCompleteRev` from firehose events for some time and
|
|
||||||
elide multiple updates to the same repo into a single write. Delay duration
|
|
||||||
would have to be at least on the order of minutes for this to be effective,
|
|
||||||
since writes to any single repo are usually initiated by human actions and have
|
|
||||||
a very low rate.
|
|
||||||
|
|
||||||
This way we can trade some RAM for reduction in writes.
|
|
||||||
|
|
||||||
### Alternative 2: skip frequent updates
|
|
||||||
|
|
||||||
Similar to the above, but instead of delaying updates, simply skip them if last
|
|
||||||
update was recent enough. This will often result in `LastCompleteRev` not
|
|
||||||
reflecting *actual* last complete `rev` for a repo, but it will keep it recent
|
|
||||||
enough.
|
|
||||||
|
|
||||||
## Detailed design

### Bad naming

In the implementation, not enough attention was paid to naming, and the usage
and meaning of some values changed slightly over time. So in the sections below
and in the code, some of the things mentioned above are named differently:

* `LastCompleteRev` - max(`LastIndexedRev`, `LastFirehoseRev`)
* `FirstUninterruptedCursor` - `FirstCursorSinceReset`
* `FirstUninterruptedFirehoseRev` - `FirstRevSinceReset`

### Metadata fields

#### PDS

* `Cursor` - last cursor value received from this PDS.
* `FirstCursorSinceReset` - earliest cursor from which we have an uninterrupted
  sequence of records up to now.

#### Repo

* `LastIndexedRev` - last `rev` recorded during the most recent full repo re-index.
  * Up to this `rev` we do have all records.
* `FirstRevSinceReset` - first `rev` seen on the firehose since the most recent
  cursor reset.
  * Changes only when an event for this repo is received, so on its own it doesn't
    guarantee that we have all subsequent records.
* `FirstCursorSinceReset` - copy of the PDS field with the same name.
  * If `FirstCursorSinceReset` >= `PDS`.`FirstCursorSinceReset` and the PDS's
    firehose is live, then we indeed have all records since `FirstRevSinceReset`.
* `LastFirehoseRev` - last `rev` seen on the firehose while we didn't have any
  interruptions.

A rough sketch of how these fields might be modeled follows.

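For illustration only, the fields above could be modeled roughly like this (a sketch; the actual schema, types, and column names in the code may differ):

```go
package indexer

// PDS holds per-host firehose state. Field names follow this document;
// the real schema may differ.
type PDS struct {
	ID                    uint
	Host                  string
	Cursor                int64 // last cursor value received from this PDS
	FirstCursorSinceReset int64 // earliest cursor with an uninterrupted sequence of records
}

// Repo holds per-repo indexing state.
type Repo struct {
	ID                    uint
	DID                   string
	PDS                   uint   // reference to the hosting PDS
	LastIndexedRev        string // rev recorded during the most recent full re-index
	FirstRevSinceReset    string // first rev seen on the firehose since the last cursor reset
	FirstCursorSinceReset int64  // copy of PDS.FirstCursorSinceReset at that moment
	LastFirehoseRev       string // last rev seen while there were no interruptions
}
```
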
### Guarantees

* Up to and including `LastIndexedRev` - all records have been indexed.
* If `LastFirehoseRev` is set - all records up to and including it have been
  indexed.

* If `FirstCursorSinceReset` >= `PDS`.`FirstCursorSinceReset`:
  * Starting from and including `FirstRevSinceReset` - we have indexed all newer
    records.
  * Consequently, if max(`LastIndexedRev`, `LastFirehoseRev`) >=
    `FirstRevSinceReset` - we have a complete copy of the repo.

* If `FirstCursorSinceReset` < `PDS`.`FirstCursorSinceReset`:
  * There was a cursor reset; we might be missing some records after
    `FirstRevSinceReset`.

* `FirstCursorSinceReset` on both repos and PDSs never gets rolled back.
* `LastIndexedRev` never gets rolled back.

A sketch of the resulting completeness check follows.

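These guarantees boil down to a small completeness predicate. A sketch using the illustrative types above, and assuming `rev` values are TID strings that order correctly under plain string comparison:

```go
package indexer

// maxRev returns the later of two revs, assuming revs are TID strings that
// sort lexicographically.
func maxRev(a, b string) string {
	if a > b {
		return a
	}
	return b
}

// haveCompleteCopy reports whether, per the guarantees above, we currently
// hold a complete copy of the repo.
func haveCompleteCopy(repo Repo, pds PDS) bool {
	if repo.FirstCursorSinceReset < pds.FirstCursorSinceReset {
		// Cursor reset: records after FirstRevSinceReset may be missing.
		return false
	}
	if repo.FirstRevSinceReset == "" {
		return false
	}
	return maxRev(repo.LastIndexedRev, repo.LastFirehoseRev) >= repo.FirstRevSinceReset
}
```
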
### Operations

#### Indexing a repo

* Resolve the current PDS hosting the repo and store its `FirstCursorSinceReset`
  in a variable.
* If the PDS is different from the one we have on record (i.e., the repo
  migrated) - update it accordingly.
* Fetch the repo.
* Upsert all fetched records.
* Set `LastIndexedRev` to the `rev` of the fetched repo.
* In a transaction, check if `Repo`.`FirstCursorSinceReset` >= the value stored
  in the first step, and set it to that value if it isn't.
  * The assumption here is that a PDS returns strongly consistent responses for
    a single repo, so fetching the repo will include all records corresponding
    to any cursor value generated before that.

A sketch of the final, transactional step follows.

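A sketch of that last step using `database/sql`; the table and column names (`repos`, `first_cursor_since_reset`) are assumptions made for illustration, not the actual schema:

```go
package indexer

import (
	"context"
	"database/sql"
	"fmt"
)

// bumpRepoCursor raises the repo's FirstCursorSinceReset to the PDS cursor
// value recorded before the fetch, but never lowers it.
func bumpRepoCursor(ctx context.Context, db *sql.DB, repoID int64, cursorBeforeFetch int64) error {
	tx, err := db.BeginTx(ctx, nil)
	if err != nil {
		return err
	}
	defer tx.Rollback()

	_, err = tx.ExecContext(ctx,
		`UPDATE repos
		    SET first_cursor_since_reset = $1
		  WHERE id = $2
		    AND first_cursor_since_reset < $1`, // never roll the value back
		cursorBeforeFetch, repoID)
	if err != nil {
		return fmt.Errorf("updating first_cursor_since_reset: %w", err)
	}
	return tx.Commit()
}
```
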
#### Connecting to firehose

* If the first message is `#info` - this means that our cursor is too old.
  * Update the PDS's `FirstCursorSinceReset` to the value supplied in the
    `#info` message.

Workaround for a buggy relay that doesn't send `#info`:

* If the first message has a cursor value different from `Cursor`+1:
  * Assume there was a cursor reset and update the PDS's `FirstCursorSinceReset`
    to the value provided in the message.

A sketch of this connection-time check follows.

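For illustration, `firstMessage` below is a deliberately simplified stand-in for the real firehose frame types:

```go
package indexer

// firstMessage is a simplified view of the first event received after
// (re)connecting to a PDS firehose.
type firstMessage struct {
	IsInfo bool  // true if this is an #info frame
	Seq    int64 // cursor/sequence number carried by the frame
}

// detectCursorReset applies the rules above: on #info, or on a gap between the
// stored cursor and the first frame, move FirstCursorSinceReset forward.
func detectCursorReset(pds *PDS, msg firstMessage) {
	switch {
	case msg.IsInfo:
		// Our cursor is too old; the relay tells us where coverage starts now.
		pds.FirstCursorSinceReset = msg.Seq
	case msg.Seq != pds.Cursor+1:
		// Buggy-relay workaround: a gap without #info is treated as a reset.
		pds.FirstCursorSinceReset = msg.Seq
	}
}
```
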
#### Receiving an event on the firehose

* Check that the event is coming from the correct PDS for the given repo.
  * TODO: maybe drop this and just check the signature.
* Process the event normally.
* If `Repo`.`FirstCursorSinceReset` >= `PDS`.`FirstCursorSinceReset`:
  * Update `LastFirehoseRev` to the event's `rev`.
* If `Repo`.`FirstCursorSinceReset` < `PDS`.`FirstCursorSinceReset`:
  * Set the repo's `FirstRevSinceReset` to the event's `rev` and
    `FirstCursorSinceReset` to `PDS`.`FirstCursorSinceReset`.

* If the `tooBig` flag is set on the message (the MST diff was larger than the
  PDS's size limit, so some records were dropped):
  * Set the repo's `FirstRevSinceReset` to the event's `rev` and
    `FirstCursorSinceReset` to `PDS`.`FirstCursorSinceReset`.
  * Note: `FirstCursorSinceReset` might be the same, but moving
    `FirstRevSinceReset` forward will likely trigger repo reindexing.

* Update the PDS's `Cursor` to the value provided in the message.

A sketch of this per-event bookkeeping follows.

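A sketch of the bookkeeping, again using the simplified illustrative types from above; persisting the updated rows is left out:

```go
package indexer

// commitEvent is a simplified view of a #commit firehose frame; only the
// fields used by the bookkeeping below are included.
type commitEvent struct {
	Seq    int64  // firehose cursor value
	Rev    string // repo rev after this commit
	TooBig bool   // MST diff exceeded the PDS size limit; records were dropped
}

// updateRepoState applies the rules above after the event's records have been
// processed normally.
func updateRepoState(repo *Repo, pds *PDS, ev commitEvent) {
	if repo.FirstCursorSinceReset >= pds.FirstCursorSinceReset {
		repo.LastFirehoseRev = ev.Rev
	} else {
		// We reconnected after a cursor reset; completeness starts over here.
		repo.FirstRevSinceReset = ev.Rev
		repo.FirstCursorSinceReset = pds.FirstCursorSinceReset
	}
	if ev.TooBig {
		// Some records were dropped from the frame; force a future re-index.
		repo.FirstRevSinceReset = ev.Rev
		repo.FirstCursorSinceReset = pds.FirstCursorSinceReset
	}
	pds.Cursor = ev.Seq
}
```
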
#### Listing repos

* Fetch a list of repos from a PDS. The response also includes the last `rev`
  for every repo.
* For each repo:
  * If `FirstRevSinceReset` is not set:
    * Set `FirstRevSinceReset` to the received `rev`.
    * Set `FirstCursorSinceReset` to the PDS's `FirstCursorSinceReset`.

#### Repo migrating to a different PDS

TODO

Currently we're simply resetting `FirstRevSinceReset`.

#### Finding repos that need indexing

* A repo's index is incomplete, and the repo needs to be indexed, if any of
  these is true:
  * `LastIndexedRev` is not set
  * max(`LastIndexedRev`, `LastFirehoseRev`) < `FirstRevSinceReset`
  * `Repo`.`FirstCursorSinceReset` < `PDS`.`FirstCursorSinceReset`

A sketch of this check follows.

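The same conditions expressed as a predicate, reusing the illustrative types and the `maxRev` helper from the sketches above:

```go
package indexer

// needsIndexing reports whether a repo's index is incomplete per the
// conditions above.
func needsIndexing(repo Repo, pds PDS) bool {
	if repo.LastIndexedRev == "" {
		return true
	}
	if maxRev(repo.LastIndexedRev, repo.LastFirehoseRev) < repo.FirstRevSinceReset {
		return true
	}
	return repo.FirstCursorSinceReset < pds.FirstCursorSinceReset
}
```
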
@ -1,62 +0,0 @@
#!/bin/sh

set -e

# ------------------------------ Write data timestamp ----------------------------------

echo "export_start" > timestamp.csv
date -Iseconds --utc >> timestamp.csv

# ------------------------------ Refresh views ----------------------------------

docker compose exec -iT postgres psql -U postgres -d bluesky <<- EOF
	\timing
	\echo Refreshing follows...
	refresh materialized view export_follows;
	\echo Refreshing like counts...
	refresh materialized view export_likes;
	\echo Refreshing reply counts...
	refresh materialized view export_replies;
	\echo Refreshing block list...
	refresh materialized view export_blocks;
	\echo Refreshing DID list...
	refresh materialized view export_dids;
EOF

# ------------------------------ Dump views into .csv ----------------------------------

echo "Writing .csv files..."
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select * from export_follows) to stdout with csv header;" > follows.csv
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select * from export_likes) to stdout with csv header;" > like_counts.csv
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select * from export_replies) to stdout with csv header;" > post_counts.csv
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select * from export_blocks) to stdout with csv header;" > blocks.csv
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select * from export_dids) to stdout with csv header;" > dids.csv

# ------------------------------ Free up space used by materialized views ----------------------------------

docker compose exec -iT postgres psql -U postgres -d bluesky <<- EOF
	\timing
	refresh materialized view export_follows with no data;
	refresh materialized view export_likes with no data;
	refresh materialized view export_replies with no data;
	refresh materialized view export_blocks with no data;
	refresh materialized view export_dids with no data;
EOF

# ------------------------------ Dump handles from plc-mirror ----------------------------------

docker compose exec -iT postgres psql -U postgres -d bluesky <<- EOF | sed -E -e 's/([^\\])\\",/\1\\\\",/g' > handles.csv
	\timing
	select did as "did:ID", replace(operation['alsoKnownAs'] ->> 0, 'at://', '') as handle
	from plc_log_entries
	where (did, plc_timestamp) in (
		select did, max(plc_timestamp) as plc_timestamp from plc_log_entries
		where not nullified
		group by did
	)
EOF
120
csv_iexport.sh

@ -1,120 +0,0 @@
#!/bin/bash
source .env

set -e

# ------------------------------ Write data timestamp ----------------------------------

date=$(date -Idate --utc)

mkdir -p ${CSV_DIR}/full
mkdir -p ${CSV_DIR}/full/${date}

echo "Output directory: ${CSV_DIR}/full/${date}"

to_timestamp=$(date -Iseconds --utc)

echo "export_start" > ${CSV_DIR}/full/${date}/timestamp.csv
echo "${to_timestamp}" >> ${CSV_DIR}/full/${date}/timestamp.csv

# ------------------------------ Refresh views ----------------------------------

docker compose exec -iT postgres psql -U postgres -d bluesky <<- EOF
	\timing
	\echo Refreshing follows...
	refresh materialized view export_follows;
	\echo Refreshing like counts...
	refresh materialized view export_likes_ladder;
	\echo Refreshing reply counts...
	refresh materialized view export_replies_ladder;
	\echo Refreshing block list...
	refresh materialized view export_blocks;
	\echo Refreshing DID list...
	refresh materialized view export_dids_ladder;
	\echo Refreshing optout list...
	refresh materialized view export_optouts;
EOF

# ------------------------------ Dump views into .csv ----------------------------------

echo "Writing .csv files..."

echo "Starting follows export..."
folows_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "insert into incremental_export_log (started, to_tsmp, collection) values ('$folows_started', '$to_timestamp', 'app.bsky.graph.follow')"
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select * from export_follows) to stdout with csv header;" > ${CSV_DIR}/full/${date}/follows.csv
echo "Finishing follows export..."
folows_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "update incremental_export_log set finished='$folows_finished' where started='$folows_started' and to_tsmp='$to_timestamp' and collection = 'app.bsky.graph.follow'"

echo "Starting blocks export..."
block_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "insert into incremental_export_log (started, to_tsmp, collection) values ('$block_started', '$to_timestamp', 'app.bsky.graph.block')"
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select * from export_blocks) to stdout with csv header;" > ${CSV_DIR}/full/${date}/blocks.csv
echo "Finishing blocks export..."
block_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "update incremental_export_log set finished='$block_finished' where started='$block_started' and to_tsmp='$to_timestamp' and collection = 'app.bsky.graph.block'"

echo "Starting likes export..."
likes_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "insert into incremental_export_log (started, to_tsmp, collection) values ('$likes_started', '$to_timestamp', 'app.bsky.feed.like')"
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select * from export_likes_ladder) to stdout with csv header;" > ${CSV_DIR}/full/${date}/like_counts.csv
echo "Finishing likes export..."
likes_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "update incremental_export_log set finished='$likes_finished' where started='$likes_started' and to_tsmp='$to_timestamp' and collection = 'app.bsky.feed.like'"

echo "Starting posts export..."
posts_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "insert into incremental_export_log (started, to_tsmp, collection) values ('$posts_started', '$to_timestamp', 'app.bsky.feed.post')"
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select * from export_replies_ladder) to stdout with csv header;" > ${CSV_DIR}/full/${date}/post_counts.csv
echo "Finishing posts export..."
posts_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "update incremental_export_log set finished='$posts_finished' where started='$posts_started' and to_tsmp='$to_timestamp' and collection = 'app.bsky.feed.post'"

echo "Starting dids export..."
dids_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "insert into incremental_export_log (started, to_tsmp, collection) values ('$dids_started', '$to_timestamp', 'did')"
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select * from export_dids_ladder) to stdout with csv header;" > ${CSV_DIR}/full/${date}/dids.csv
echo "Finishing dids export..."
dids_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "update incremental_export_log set finished='$dids_finished' where started='$dids_started' and to_tsmp='$to_timestamp' and collection = 'did'"

echo "Starting optouts export..."
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select did from repos as r inner join records_block as rb on r.id=rb.repo where rb.content['subject']::text like '%did:plc:qevje4db3tazfbbialrlrkza%') to stdout with csv header;" > ${CSV_DIR}/full/${date}/optout.csv
echo "Finishing optouts export..."

# ------------------------------ DO NOT Free up space used by materialized views for incremental refresh ----------------------------------

# ------------------------------ Dump handles from plc-mirror ----------------------------------

echo "Starting handles export..."
handles_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "insert into incremental_export_log (started, to_tsmp, collection) values ('$handles_started', '$to_timestamp', 'handle')"
docker exec -t plc-postgres-1 psql -U postgres -d plc \
  -c 'copy (select handle, did as "did:ID" from actors) to stdout with (format csv , header, force_quote ("handle"));' | sed -E -e 's/([^\\])\\",/\1\\\\",/g' > ${CSV_DIR}/full/${date}/handles.csv
echo "Finishing handles export..."
handles_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "update incremental_export_log set finished='$handles_finished' where started='$handles_started' and to_tsmp='$to_timestamp' and collection = 'handle'"

echo "Export finished."
@ -1,118 +0,0 @@
#!/bin/bash
source .env

set -e

# ------------------------------ Write data timestamp ----------------------------------

date=$(date -Idate --utc)

mkdir -p ${CSV_DIR}/monthly
mkdir -p ${CSV_DIR}/monthly/${date}

echo "Output directory: ${CSV_DIR}/monthly/${date}"

to_timestamp=$(date -Iseconds --utc)
echo "export_start" > ${CSV_DIR}/monthly/${date}/timestamp.csv
echo "${to_timestamp}" >> ${CSV_DIR}/monthly/${date}/timestamp.csv

# ------------------------------ Refresh views ----------------------------------

docker compose exec -iT postgres psql -U postgres -d bluesky <<- EOF
	\timing
	\echo Refreshing follows...
	refresh materialized view export_follows_month;
	\echo Refreshing like counts...
	refresh materialized view export_likes_month;
	\echo Refreshing reply counts...
	refresh materialized view export_replies_month;
	\echo Refreshing block list...
	refresh materialized view export_blocks_month;
	\echo Refreshing DID list...
	refresh materialized view export_dids_month;
	\echo Refreshing optout list...
	refresh materialized view export_optouts;
EOF

# ------------------------------ Dump views into .csv ----------------------------------

echo "Writing .csv files..."

echo "Starting follows export..."
folows_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "insert into incremental_export_log (started, to_tsmp, collection) values ('$folows_started', '$to_timestamp', 'app.bsky.graph.follow_month')"
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select * from export_follows_month) to stdout with csv header;" > ${CSV_DIR}/monthly/${date}/follows.csv
echo "Finishing follows export..."
folows_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "update incremental_export_log set finished='$folows_finished' where started='$folows_started' and to_tsmp='$to_timestamp' and collection = 'app.bsky.graph.follow_month'"

echo "Starting blocks export..."
block_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "insert into incremental_export_log (started, to_tsmp, collection) values ('$block_started', '$to_timestamp', 'app.bsky.graph.block_month')"
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select * from export_blocks_month) to stdout with csv header;" > ${CSV_DIR}/monthly/${date}/blocks.csv
echo "Finishing blocks export..."
block_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "update incremental_export_log set finished='$block_finished' where started='$block_started' and to_tsmp='$to_timestamp' and collection = 'app.bsky.graph.block_month'"

echo "Starting likes export..."
likes_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "insert into incremental_export_log (started, to_tsmp, collection) values ('$likes_started', '$to_timestamp', 'app.bsky.feed.like_month')"
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select * from export_likes_month) to stdout with csv header;" > ${CSV_DIR}/monthly/${date}/like_counts.csv
echo "Finishing likes export..."
likes_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "update incremental_export_log set finished='$likes_finished' where started='$likes_started' and to_tsmp='$to_timestamp' and collection = 'app.bsky.feed.like_month'"

echo "Starting posts export..."
posts_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "insert into incremental_export_log (started, to_tsmp, collection) values ('$posts_started', '$to_timestamp', 'app.bsky.feed.post_month')"
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select * from export_replies_month) to stdout with csv header;" > ${CSV_DIR}/monthly/${date}/post_counts.csv
echo "Finishing posts export..."
posts_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "update incremental_export_log set finished='$posts_finished' where started='$posts_started' and to_tsmp='$to_timestamp' and collection = 'app.bsky.feed.post_month'"

echo "Starting dids export..."
dids_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "insert into incremental_export_log (started, to_tsmp, collection) values ('$dids_started', '$to_timestamp', 'did_month')"
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select * from export_dids_month) to stdout with csv header;" > ${CSV_DIR}/monthly/${date}/dids.csv
echo "Finishing dids export..."
dids_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "update incremental_export_log set finished='$dids_finished' where started='$dids_started' and to_tsmp='$to_timestamp' and collection = 'did_month'"

echo "Starting optouts export..."
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select did from repos as r inner join records_block as rb on r.id=rb.repo where rb.content['subject']::text like '%did:plc:qevje4db3tazfbbialrlrkza%') to stdout with csv header;" > ${CSV_DIR}/monthly/${date}/optout.csv
echo "Finishing optouts export..."

# ------------------------------ DO NOT Free up space used by materialized views for incremental refresh ----------------------------------

# ------------------------------ Dump handles from plc-mirror ----------------------------------

echo "Starting handles export..."
handles_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "insert into incremental_export_log (started, to_tsmp, collection) values ('$handles_started', '$to_timestamp', 'handle_month')"
docker exec -t plc-postgres-1 psql -U postgres -d plc \
  -c 'copy (select handle, did as "did:ID" from actors) to stdout with (format csv , header, force_quote ("handle"));' | sed -E -e 's/([^\\])\\",/\1\\\\",/g' > ${CSV_DIR}/monthly/${date}/handles.csv
echo "Finishing handles export..."
handles_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "update incremental_export_log set finished='$handles_finished' where started='$handles_started' and to_tsmp='$to_timestamp' and collection = 'handle_month'"

echo "Export finished."
@ -1,28 +0,0 @@
.dashboard | . as $dash

| [paths(type == "object"
    and (.datasource?.uid? | type) == "string"
    and .datasource.type? == "prometheus")] as $uids

| reduce $uids[] as $path ([]; ($dash | getpath($path).datasource.uid) as $uid | if [.[] == $uid] | any then . else . + [$uid] end)
| . as $unique_uids

| [range($unique_uids | length) | {key: $unique_uids[.], value: "DS\(.+1)"}]
| from_entries as $uid_map

| reduce $uids[] as $path ($dash; setpath($path + ["datasource", "uid"]; "${\($uid_map[getpath($path).datasource.uid])}"))

| reduce paths(type == "object" and has("current") and has("datasource"))
    as $path (.; setpath($path + ["current"]; {}))

| .id = null
| .__inputs = [$unique_uids[] | {
    name: $uid_map[.],
    label: "Prometheus",
    description: "",
    type: "datasource",
    pluginId: "prometheus",
    pluginName: "Prometheus",
  }]
| .__requires = []
| .__elements = {}
@ -1,13 +0,0 @@
#!/bin/sh

set -e

cd "$(dirname "$0")"

. ../.env

: ${DASHBOARD_NAME:=indexer}
: ${DASHBOARD_UID:="$(jq -r .uid "${DASHBOARD_NAME}.json")"}

curl -s --fail-with-body "${GRAFANA_URL}/api/dashboards/uid/${DASHBOARD_UID}" | jq --sort-keys -f export.jq > "${DASHBOARD_NAME}.json"

File diff suppressed because it is too large
@ -1,18 +0,0 @@
$current[0].dashboard as $cur
| ([$cur | .. | select(.datasource?.type? == "prometheus")] | first | .datasource.uid) as $datasource

| .templating.list = [
    .templating.list[] | .name as $name
    | .current = ($cur.templating.list[] | select(.name == $name) | .current) // {}
  ]

| . as $dash

| [paths(type == "object"
    and .datasource.type? == "prometheus")] as $uids

| reduce $uids[] as $path ($dash; setpath($path + ["datasource", "uid"]; $datasource))

| .id = $cur.id
| .version = $cur.version
| {dashboard: ., overwrite: false}
@ -1,24 +0,0 @@
#!/bin/sh

set -e

cd "$(dirname "$0")"

. ../.env

: ${DASHBOARD_NAME:=indexer}
: ${DASHBOARD_UID:="$(jq -r .uid "${DASHBOARD_NAME}.json")"}

if ! curl -X HEAD -s --fail-with-body "${GRAFANA_URL}/api/dashboards/uid/${DASHBOARD_UID}"; then
  echo "Dashboard with UID ${DASHBOARD_UID} is not found. Please import $(dirname "$0")/${DASHBOARD_NAME}.json once, and later use this command again to update it." >&2
  exit 1
fi

CUR_DASHBOARD="$(mktemp -t "${DASHBOARD_NAME}.json.XXXXXXX")"
curl -s --fail-with-body "${GRAFANA_URL}/api/dashboards/uid/${DASHBOARD_UID}" > "${CUR_DASHBOARD}"

jq --slurpfile current "${CUR_DASHBOARD}" \
  -f update.jq "${DASHBOARD_NAME}.json" \
  | curl --json @- -s --fail-with-body "${GRAFANA_URL}/api/dashboards/db"

rm "${CUR_DASHBOARD}"
@ -1,5 +0,0 @@
# DB migrations

WARNING: due to partitioning schema changes (which require re-creating the
tables from scratch), some migrations were **edited**. Their previous versions
have been copied to the `migrations/obsolete` folder.
@ -1,22 +0,0 @@
insert into pds (host) values ('https://agaric.us-west.host.bsky.network'),
  ('https://amanita.us-east.host.bsky.network'),
  ('https://blewit.us-west.host.bsky.network'),
  ('https://boletus.us-west.host.bsky.network'),
  ('https://bsky.social'),
  ('https://chaga.us-west.host.bsky.network'),
  ('https://conocybe.us-west.host.bsky.network'),
  ('https://enoki.us-east.host.bsky.network'),
  ('https://hydnum.us-west.host.bsky.network'),
  ('https://inkcap.us-east.host.bsky.network'),
  ('https://lepista.us-west.host.bsky.network'),
  ('https://lionsmane.us-east.host.bsky.network'),
  ('https://maitake.us-west.host.bsky.network'),
  ('https://morel.us-east.host.bsky.network'),
  ('https://oyster.us-east.host.bsky.network'),
  ('https://porcini.us-east.host.bsky.network'),
  ('https://puffball.us-east.host.bsky.network'),
  ('https://russula.us-west.host.bsky.network'),
  ('https://shiitake.us-east.host.bsky.network'),
  ('https://shimeji.us-east.host.bsky.network'),
  ('https://verpa.us-west.host.bsky.network')
on conflict do nothing;
@ -1,93 +0,0 @@
\timing

CREATE EXTENSION pg_partman SCHEMA public;

alter table records rename to records_like;

create table records
  (like records_like including defaults)
  partition by list (collection);

drop index idx_repo_record_key;
drop index idx_repo_rev;
alter sequence records_id_seq owned by records.id;

drop table records_like;

create index on records (collection, repo, rkey);

CREATE OR REPLACE FUNCTION setup_partition(in collection text, in suffix text) RETURNS boolean AS $$
BEGIN
  EXECUTE 'CREATE TABLE records_' || suffix ||
    ' PARTITION OF records FOR VALUES IN (' || quote_literal(collection) || ')
    PARTITION BY RANGE (created_at)';
  EXECUTE 'CREATE INDEX ON records_' || suffix || ' (created_at)';
  EXECUTE 'alter table records_' || suffix || ' add check (collection = ' || quote_literal(collection) || ')';

  PERFORM public.create_parent('public.records_' || suffix, 'created_at', '1 month',
    p_start_partition := '2024-02-01');
  RETURN true;
END;
$$ LANGUAGE plpgsql;

select setup_partition('app.bsky.feed.like', 'like');
select setup_partition('app.bsky.feed.post', 'post');
select setup_partition('app.bsky.graph.follow', 'follow');
select setup_partition('app.bsky.graph.block', 'block');
select setup_partition('app.bsky.feed.repost', 'repost');
select setup_partition('app.bsky.actor.profile', 'profile');
select setup_partition('app.bsky.graph.list', 'list');
select setup_partition('app.bsky.graph.listblock', 'listblock');
select setup_partition('app.bsky.graph.listitem', 'listitem');

CREATE TABLE records_default
  PARTITION OF records DEFAULT
  PARTITION BY RANGE (created_at);
CREATE INDEX ON records_default (created_at);

SELECT public.create_parent('public.records_default', 'created_at', '1 month',
  p_start_partition := '2024-02-01');

create index idx_like_subject
  on records_like
  (split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3));

create index idx_follow_subject
  on records_follow
  (jsonb_extract_path_text(content, 'subject'));

create index idx_reply_subject
  on records_post
  (split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3));

create index listitem_uri_subject
  on records_listitem
  (
    jsonb_extract_path_text(content, 'list'),
    jsonb_extract_path_text(content, 'subject'))
  include (deleted);

create index listitem_subject_uri
  on records_listitem
  (
    jsonb_extract_path_text(content, 'subject'),
    jsonb_extract_path_text(content, 'list'))
  include (deleted);

create view listitems as
  select *, jsonb_extract_path_text(content, 'list') as list,
    jsonb_extract_path_text(content, 'subject') as subject
  from records_listitem;

create view lists as
  select records_list.*,
    jsonb_extract_path_text(content, 'name') as name,
    jsonb_extract_path_text(content, 'description') as description,
    jsonb_extract_path_text(content, 'purpose') as purpose,
    'at://' || repos.did || '/app.bsky.graph.list/' || rkey as uri
  from records_list join repos on records_list.repo = repos.id;
@ -1,28 +0,0 @@
create index post_langs on records_post using gin (jsonb_extract_path(content, 'langs') jsonb_ops);

-- There are invalid/non-conforming values that need to be handled somehow.
create function parse_timestamp(text)
  returns timestamp
  returns null on null input
  immutable
as
$$
begin
  begin
    return $1::timestamp;
  exception
    when others then
      return null;
  end;
end;
$$
language plpgsql;

create index post_created_at on records_post (parse_timestamp(jsonb_extract_path_text(content, 'createdAt')));

create view posts as
  select *, jsonb_extract_path(content, 'langs') as langs,
    parse_timestamp(jsonb_extract_path_text(content, 'createdAt')) as content_created_at
  from records_post;

explain select count(*) from posts where langs ? 'uk' and content_created_at > now() - interval '1 day';
@ -1,46 +0,0 @@
-- Create a bunch of materialized views, but don't populate them right away.

create materialized view export_follows
  as select repos.did as ":START_ID",
    records.content ->> 'subject' as ":END_ID"
  from repos join records on repos.id = records.repo
  where records.collection = 'app.bsky.graph.follow'
    and records.content ->> 'subject' <> repos.did
  with no data;
create index export_follow_subject on export_follows (":END_ID");

-- Thanks to `join`, eats up 30GB+ of space while refreshing, but
-- finishes in under an hour.
create materialized view export_likes
  as select repos.did as ":START_ID",
    split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3) as ":END_ID",
    count(*) as "count:long"
  from records join repos on records.repo = repos.id
  where records.collection = 'app.bsky.feed.like'
    and repos.did <> split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3)
  group by repos.did, split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3)
  with no data;
create index export_like_subject on export_likes (":END_ID");

create materialized view export_replies
  as select repos.did as ":START_ID",
    split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3) as ":END_ID",
    count(*) as "count:long"
  from records join repos on records.repo = repos.id
  where records.collection = 'app.bsky.feed.post'
    and repos.did <> split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3)
  group by repos.did, split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3)
  with no data;
create index export_reply_subject on export_replies (":END_ID");

create materialized view export_dids
  as select distinct did as "did:ID" from (
    select did from repos
    union
    select distinct ":END_ID" as did from export_follows
    union
    select distinct ":END_ID" as did from export_likes
    union
    select distinct ":END_ID" as did from export_replies
  )
  with no data;
@ -1,10 +0,0 @@
-- Create a block materialized view, don't populate right away.

create materialized view export_blocks
  as select repos.did as ":START_ID",
    records.content ->> 'subject' as ":END_ID"
  from repos join records on repos.id = records.repo
  where records.collection = 'app.bsky.graph.block'
    and records.content ->> 'subject' <> repos.did
  with no data;
create index export_block_subject on export_blocks (":END_ID");
@ -1,16 +0,0 @@
CREATE TABLE incremental_export_log (
  id SERIAL PRIMARY KEY,
  collection text NOT NULL,
  to_tsmp TIMESTAMP NOT NULL,
  started TIMESTAMP,
  finished TIMESTAMP
);

CREATE UNIQUE INDEX incremental_export_log_idx on incremental_export_log ("collection", "to_tsmp");

-- manually insert your latest snapshot here
-- insert into incremental_export_log (started, finished, to_tsmp, collection) values ('2024-02-27T05:53:30+00:00', '2024-02-27T07:23:30+00:00', '2024-02-27T05:53:30+00:00', 'app.bsky.graph.follow');
-- insert into incremental_export_log (started, finished, to_tsmp, collection) values ('2024-02-27T05:53:30+00:00', '2024-02-27T07:23:30+00:00', '2024-02-27T05:53:30+00:00', 'app.bsky.feed.like');
-- insert into incremental_export_log (started, finished, to_tsmp, collection) values ('2024-02-27T05:53:30+00:00', '2024-02-27T07:23:30+00:00', '2024-02-27T05:53:30+00:00', 'app.bsky.feed.post');
-- insert into incremental_export_log (started, finished, to_tsmp, collection) values ('2024-02-27T05:53:30+00:00', '2024-02-27T07:23:30+00:00', '2024-02-27T05:53:30+00:00', 'did');
-- insert into incremental_export_log (started, finished, to_tsmp, collection) values ('2024-02-27T05:53:30+00:00', '2024-02-27T07:23:30+00:00', '2024-02-27T05:53:30+00:00', 'handle');
@ -1,16 +0,0 @@
drop materialized view export_dids;

create materialized view export_dids
  as select distinct did as "did:ID" from (
    select did from repos
    union
    select distinct ":END_ID" as did from export_follows
    union
    select distinct ":END_ID" as did from export_likes
    union
    select distinct ":END_ID" as did from export_replies
    union
    select distinct ":END_ID" as did from export_blocks
  )
  with no data;
@ -1,5 +0,0 @@
drop materialized view export_optouts;

create materialized view export_optouts
  as select did as "did:ID" from repos as r inner join records_block as rb on r.id=rb.repo where rb.content['subject']::text like '%did:plc:qevje4db3tazfbbialrlrkza%'
  with no data;
@ -1,62 +0,0 @@
-- Create a bunch of materialized views, but don't populate them right away.

create materialized view export_follows_month
  as select repos.did as ":START_ID",
    records.content ->> 'subject' as ":END_ID"
  from repos join records on repos.id = records.repo
  where records.collection = 'app.bsky.graph.follow'
    and records.content ->> 'subject' <> repos.did
    and records.created_at > CURRENT_DATE - INTERVAL '30' DAY
  with no data;
create index export_follow_subject_month on export_follows_month (":END_ID");

-- Thanks to `join`, eats up 30GB+ of space while refreshing, but
-- finishes in under an hour.
create materialized view export_likes_month
  as select repos.did as ":START_ID",
    split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3) as ":END_ID",
    count(*) as "count:long"
  from records join repos on records.repo = repos.id
  where records.collection = 'app.bsky.feed.like'
    and records.created_at > CURRENT_DATE - INTERVAL '30' DAY
    and repos.did <> split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3)
  group by repos.did, split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3)
  with no data;
create index export_like_subject_month on export_likes_month (":END_ID");

create materialized view export_replies_month
  as select repos.did as ":START_ID",
    split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3) as ":END_ID",
    count(*) as "count:long"
  from records join repos on records.repo = repos.id
  where records.collection = 'app.bsky.feed.post'
    and records.created_at > CURRENT_DATE - INTERVAL '30' DAY
    and repos.did <> split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3)
  group by repos.did, split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3)
  with no data;
create index export_reply_subject_month on export_replies_month (":END_ID");

create materialized view export_blocks_month
  as select repos.did as ":START_ID",
    records.content ->> 'subject' as ":END_ID"
  from repos join records on repos.id = records.repo
  where records.collection = 'app.bsky.graph.block'
    and records.created_at > CURRENT_DATE - INTERVAL '30' DAY
    and records.content ->> 'subject' <> repos.did
  with no data;
create index export_block_subject_month on export_blocks_month (":END_ID");

create materialized view export_dids_month
  as select distinct did as "did:ID" from (
    select did from repos
    union
    select distinct ":END_ID" as did from export_follows_month
    union
    select distinct ":END_ID" as did from export_likes_month
    union
    select distinct ":END_ID" as did from export_replies_month
    union
    select distinct ":END_ID" as did from export_blocks_month
  )
  with no data;
@ -1,47 +0,0 @@
drop materialized view export_dids_ladder;
drop materialized view export_replies_ladder;
drop materialized view export_likes_ladder;

create materialized view export_likes_ladder
  as select repos.did as ":START_ID",
    split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3) as ":END_ID",
    count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '30' DAY) * 10 +
    count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '60' DAY) * 5 +
    count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '90' DAY) * 3 +
    count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '360' DAY) * 1 as "count:long"
  from records join repos on records.repo = repos.id
  where records.collection = 'app.bsky.feed.like'
    and repos.did <> split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3)
  group by repos.did, split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3)
  with no data;
create index export_like_subject_ladder on export_likes_ladder (":END_ID");

create materialized view export_replies_ladder
  as select repos.did as ":START_ID",
    split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3) as ":END_ID",
    count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '30' DAY) * 10 +
    count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '60' DAY) * 5 +
    count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '90' DAY) * 3 +
    count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '360' DAY) * 1 as "count:long"
  from records join repos on records.repo = repos.id
  where records.collection = 'app.bsky.feed.post'
    and repos.did <> split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3)
  group by repos.did, split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3)
  with no data;
create index export_reply_subject_ladder on export_replies_ladder (":END_ID");

create materialized view export_dids_ladder
  as select distinct did as "did:ID" from (
    select did from repos
    union
    select distinct ":END_ID" as did from export_follows
    union
    select distinct ":END_ID" as did from export_likes_ladder
    union
    select distinct ":END_ID" as did from export_replies_ladder
    union
    select distinct ":END_ID" as did from export_blocks
  )
  with no data;

create index idx_records_created_at on records (created_at);
@ -1,47 +0,0 @@
drop materialized view export_dids_ladder;
drop materialized view export_replies_ladder;
drop materialized view export_likes_ladder;

create materialized view export_likes_ladder
  as select repos.did as ":START_ID",
    split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3) as ":END_ID",
    count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '30' DAY) * 10 +
    count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '60' DAY) * 5 +
    count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '90' DAY) * 3 +
    count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '360' DAY) * 1 as "count:long"
  from records join repos on records.repo = repos.id
  where records.collection = 'app.bsky.feed.like'
    and repos.did <> split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3)
    and repos.did like 'did:%'
  group by repos.did, split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3)
  with no data;
create index export_like_subject_ladder on export_likes_ladder (":END_ID");

create materialized view export_replies_ladder
  as select repos.did as ":START_ID",
    split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3) as ":END_ID",
    count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '30' DAY) * 10 +
    count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '60' DAY) * 5 +
    count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '90' DAY) * 3 +
    count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '360' DAY) * 1 as "count:long"
  from records join repos on records.repo = repos.id
  where records.collection = 'app.bsky.feed.post'
    and repos.did <> split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3)
    and repos.did like 'did:%'
  group by repos.did, split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3)
  with no data;
create index export_reply_subject_ladder on export_replies_ladder (":END_ID");

create materialized view export_dids_ladder
  as select distinct did as "did:ID" from (
    select did from repos
    union
    select distinct ":END_ID" as did from export_follows
    union
    select distinct ":END_ID" as did from export_likes_ladder
    union
    select distinct ":END_ID" as did from export_replies_ladder
    union
    select distinct ":END_ID" as did from export_blocks
  )
  with no data;
@ -1,62 +0,0 @@
alter table records rename to records_like;

create table records
  (like records_like including defaults)
  partition by list (collection);

drop index idx_repo_record_key;
drop index idx_repo_rev;
alter table records_like drop constraint records_pkey;
create unique index records_pkey on records (id, collection);

create table records_default
  partition of records default;

create table records_post
  partition of records for values in ('app.bsky.feed.post');
create table records_follow
  partition of records for values in ('app.bsky.graph.follow');
create table records_block
  partition of records for values in ('app.bsky.graph.block');
create table records_repost
  partition of records for values in ('app.bsky.feed.repost');
create table records_profile
  partition of records for values in ('app.bsky.actor.profile');

ALTER TABLE records_like
  ADD CHECK (collection in ('app.bsky.feed.like'));

ALTER TABLE records_post
  ADD CHECK (collection in ('app.bsky.feed.post'));

ALTER TABLE records_follow
  ADD CHECK (collection in ('app.bsky.graph.follow'));

ALTER TABLE records_repost
  ADD CHECK (collection in ('app.bsky.feed.repost'));

ALTER TABLE records_profile
  ADD CHECK (collection in ('app.bsky.actor.profile'));

-- SLOW, can run overnight
with moved_rows as (
  delete from records_like r
  where collection <> 'app.bsky.feed.like'
  returning r.*
)
insert into records select * from moved_rows;

-- ULTRA SLOW, DO NOT RUN on large DB
alter table records attach partition records_like for values in ('app.bsky.feed.like');

create index idx_like_subject
  on records_like
  (split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3));

create index idx_follow_subject
  on records_follow
  (jsonb_extract_path_text(content, 'subject'));

create index idx_reply_subject
  on records_post
  (split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3));
@ -1,54 +0,0 @@
alter table records detach partition records_default;

create table records_list
  partition of records for values in ('app.bsky.graph.list');
create table records_listblock
  partition of records for values in ('app.bsky.graph.listblock');
create table records_listitem
  partition of records for values in ('app.bsky.graph.listitem');

ALTER TABLE records_list
  ADD CHECK (collection in ('app.bsky.graph.list'));

ALTER TABLE records_listblock
  ADD CHECK (collection in ('app.bsky.graph.listblock'));

ALTER TABLE records_listitem
  ADD CHECK (collection in ('app.bsky.graph.listitem'));

with moved_rows as (
  delete from records_default r
  where collection in ('app.bsky.graph.list', 'app.bsky.graph.listblock', 'app.bsky.graph.listitem')
  returning r.*
)
insert into records select * from moved_rows;

alter table records attach partition records_default default;

create index listitem_uri_subject
  on records_listitem
  (
    jsonb_extract_path_text(content, 'list'),
    jsonb_extract_path_text(content, 'subject'))
  include (deleted);

create index listitem_subject_uri
  on records_listitem
  (
    jsonb_extract_path_text(content, 'subject'),
    jsonb_extract_path_text(content, 'list'))
  include (deleted);

create view listitems as
  select *, jsonb_extract_path_text(content, 'list') as list,
    jsonb_extract_path_text(content, 'subject') as subject
  from records_listitem;

create view lists as
  select records_list.*,
    jsonb_extract_path_text(content, 'name') as name,
    jsonb_extract_path_text(content, 'description') as description,
    jsonb_extract_path_text(content, 'purpose') as purpose,
    'at://' || repos.did || '/app.bsky.graph.list/' || rkey as uri
  from records_list join repos on records_list.repo = repos.id;
@ -1,23 +0,0 @@
DROP VIEW posts;
DROP VIEW lists;
DROP VIEW listitems;

ALTER TABLE "records" ALTER COLUMN "deleted" TYPE boolean USING "deleted"::boolean;

create view posts as
  select *, jsonb_extract_path(content, 'langs') as langs,
    parse_timestamp(jsonb_extract_path_text(content, 'createdAt')) as content_created_at
  from records_post;

create view lists as
  select records_list.*,
    jsonb_extract_path_text(content, 'name') as name,
    jsonb_extract_path_text(content, 'description') as description,
    jsonb_extract_path_text(content, 'purpose') as purpose,
    'at://' || repos.did || '/app.bsky.graph.list/' || rkey as uri
  from records_list join repos on records_list.repo = repos.id;

create view listitems as
  select *, jsonb_extract_path_text(content, 'list') as list,
    jsonb_extract_path_text(content, 'subject') as subject
  from records_listitem;
@ -1,19 +0,0 @@
# See https://docs.docker.com/compose/multiple-compose-files/merge/ for how
# exactly these overrides get applied to the main file.
# tl;dr: strings and numbers get overwritten, lists get concatenated
services:
  # Expose PostgreSQL TCP port
  postgres:
    ports:
      - "0.0.0.0:15432:5432"

  # Change the default number of indexer threads
  record-indexer:
    environment:
      INDEXER_WORKERS: 15

  # Enable PDS discovery via a relay
  consumer:
    environment:
      CONSUMER_RELAYS: "https://bsky.network"
@ -1,11 +1,6 @@
|
||||||
services:
|
services:
|
||||||
postgres:
|
postgres:
|
||||||
# image: "postgres:16"
|
image: "postgres:16"
|
||||||
build:
|
|
||||||
context: ./docker
|
|
||||||
dockerfile_inline: |
|
|
||||||
FROM postgres:16
|
|
||||||
RUN apt update && apt install -y postgresql-16-partman
|
|
||||||
volumes:
|
volumes:
|
||||||
- "${DATA_DIR:?specify data dir in .env file}/postgres:/var/lib/postgresql/data:rw"
|
- "${DATA_DIR:?specify data dir in .env file}/postgres:/var/lib/postgresql/data:rw"
|
||||||
restart: always
|
restart: always
|
||||||
|
@ -21,37 +16,8 @@ services:
    environment:
      POSTGRES_DB: bluesky
      POSTGRES_PASSWORD: "${POSTGRES_PASSWORD:?specify password in .env file}"
    command: [
      "-c", "max_connections=1000",
      "-c", "shared_buffers=8GB",
      "-c", "work_mem=2GB",
      "-c", "max_parallel_workers_per_gather=8",
      "-c", "max_wal_size=8GB",
      "-c", "shared_preload_libraries=pg_partman_bgw",
      "-c", "pg_partman_bgw.interval=3600",
      "-c", "pg_partman_bgw.role=postgres",
      "-c", "pg_partman_bgw.dbname=bluesky",
    ]
    shm_size: '16gb'
    stop_grace_period: 24h

  update-db-schema:
    build:
      context: .
      dockerfile: cmd/update-db-schema/Dockerfile
    extra_hosts:
      - "host.docker.internal:host-gateway"
    restart: on-failure
    image: uabluerail/update-db-schema
    links:
      - postgres:db
    depends_on:
      postgres:
        condition: service_healthy
    environment:
      UPDATE-DB-SCHEMA_POSTGRES_URL: "postgres://postgres:${POSTGRES_PASSWORD}@db/bluesky?sslmode=disable"
    command: [ "--log-level=0" ]

  plc:
    build:
      context: .
@ -69,133 +35,16 @@ services:
    depends_on:
      postgres:
        condition: service_healthy
      update-db-schema:
        condition: service_completed_successfully
    environment:
      PLC_METRICS_PORT: '8080'
      PLC_POSTGRES_URL: "postgres://postgres:${POSTGRES_PASSWORD}@db/bluesky?sslmode=disable"
    ports:
      - "${METRICS_ADDR:-0.0.0.0}:11004:8080"
      - "0.0.0.0:11004:8080"
    command: [ "--log-level=0" ]
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/ready"]
      interval: 30s
      timeout: 5s
      retries: 30
      start_period: 6h
      start_period: 12h
      start_interval: 15s

  lister:
    build:
      context: .
      dockerfile: cmd/lister/Dockerfile
    extra_hosts:
      - "host.docker.internal:host-gateway"
    restart: always
    image: uabluerail/repo-lister
    deploy:
      resources:
        limits:
          memory: 1G
    links:
      - postgres:db
      - plc:plc
    depends_on:
      postgres:
        condition: service_healthy
      plc:
        condition: service_healthy
      update-db-schema:
        condition: service_completed_successfully
    environment:
      LISTER_METRICS_PORT: '8080'
      LISTER_POSTGRES_URL: "postgres://postgres:${POSTGRES_PASSWORD}@db/bluesky?sslmode=disable"
      ATP_PLC_ADDR: "http://plc:8080"
    ports:
      - "${METRICS_ADDR:-0.0.0.0}:11001:8080"
    command: [ "--log-level=0" ]

  consumer:
    build:
      context: .
      dockerfile: cmd/consumer/Dockerfile
    extra_hosts:
      - "host.docker.internal:host-gateway"
    restart: always
    image: uabluerail/firehose-consumer
    deploy:
      resources:
        limits:
          memory: 1G
    links:
      - postgres:db
      - plc:plc
    depends_on:
      postgres:
        condition: service_healthy
      plc:
        condition: service_healthy
      update-db-schema:
        condition: service_completed_successfully
    environment:
      CONSUMER_METRICS_PORT: '8080'
      CONSUMER_POSTGRES_URL: "postgres://postgres:${POSTGRES_PASSWORD}@db/bluesky?sslmode=disable"
      # CONSUMER_RELAYS: "https://bsky.network" # Effectively doubles inbound network traffic. Set this in docker-compose.override.yml if needed.
      ATP_PLC_ADDR: "http://plc:8080"
    ports:
      - "${METRICS_ADDR:-0.0.0.0}:11002:8080"
    command: [ "--log-level=0" ]

  record-indexer:
    build:
      context: .
      dockerfile: cmd/record-indexer/Dockerfile
    extra_hosts:
      - "host.docker.internal:host-gateway"
    restart: always
    image: uabluerail/record-indexer
    deploy:
      resources:
        limits:
          memory: 4G
    links:
      - postgres:db
      - plc:plc
    depends_on:
      postgres:
        condition: service_healthy
      plc:
        condition: service_healthy
      update-db-schema:
        condition: service_completed_successfully
    dns:
      - 1.1.1.1
      - 8.8.8.8
    environment:
      INDEXER_METRICS_PORT: '8080'
      INDEXER_POSTGRES_URL: "postgres://postgres:${POSTGRES_PASSWORD}@db/bluesky?sslmode=disable"
      INDEXER_WORKERS: 50
      ATP_PLC_ADDR: "http://plc:8080"
    ports:
      - "${METRICS_ADDR:-0.0.0.0}:11003:8080"
    command: [ "--log-level=0" ]

  query-exporter:
    image: adonato/query-exporter:latest
    environment:
      POSTGRES_URL: "postgres://postgres:${POSTGRES_PASSWORD}@postgres:5432/bluesky?sslmode=disable"
    volumes:
      - "./metrics/prometheus/exporters/query-exporter/config.yaml:/config.yaml"
    extra_hosts:
      - "host.docker.internal:host-gateway"
    ports:
      - "${METRICS_ADDR:-0.0.0.0}:9560:9560"
    links:
      - postgres:postgres
    depends_on:
      postgres:
        condition: service_healthy
      update-db-schema:
        # Not a strict dependency, but it's better to not have it running
        # unnecessary queries during a costly migration.
        condition: service_completed_successfully
@ -1 +0,0 @@
*
du.sql
@ -1,15 +0,0 @@
SELECT
  relname as table_name,
  pg_size_pretty(pg_total_relation_size(relid)) As "Total Size",
  pg_size_pretty(pg_indexes_size(relid)) as "Index Size",
  pg_size_pretty(pg_table_size(relid)) as "Actual Size"
FROM pg_catalog.pg_statio_user_tables
ORDER BY pg_total_relation_size(relid) DESC;

SELECT
  relname as table_name,
  indexrelname as index_name,
  pg_size_pretty(pg_table_size(indexrelid)) as "Index Size"
FROM pg_catalog.pg_statio_user_indexes
ORDER BY pg_table_size(indexrelid) DESC;
@ -1,9 +1,2 @@
POSTGRES_PASSWORD='some password'
DATA_DIR=
CSV_DIR=

# IP address to expose HTTP ports on
METRICS_ADDR=0.0.0.0

# Grafana URL with username and password. Only needed if you're going to import the dashboard.
#GRAFANA_URL="https://<username>:<password>@<hostname>"
@ -1,97 +0,0 @@
# Graceful shutdown/restart

`docker compose stop lister`
`docker compose stop consumer`
`docker compose stop record-indexer`

Take a look at Grafana; once all is quiet:

`docker compose stop postgres`

Start everything up:

`docker compose up -d --build`

# Control number of workers

Full throttle
`curl 'localhost:11003/pool/resize?size=50'`

Half throttle (recommended)
`curl 'localhost:11003/pool/resize?size=25'`

Stop eating all of my Internet
`curl 'localhost:11003/pool/resize?size=10'`

# Peek into db

`docker compose exec -it postgres psql -U postgres -d bluesky`

Seen repos
`select count(*) from repos;`

Fully indexed repos
`select count(*) from repos where last_indexed_rev <> '' and (last_indexed_rev >= first_rev_since_reset or first_rev_since_reset is null or first_rev_since_reset = '');`

Get list blocks

non-partitioned (very slow)

```
select count(*) from (select distinct repo from records where collection in ('app.bsky.graph.listblock') and deleted=false and content['subject']::text like '"at://did:plc:bmjomljebcsuxolnygfgqtap/%');
```

partitioned (slow)
`select count(*) from (select distinct repo from records_listblock where deleted=false and content['subject']::text like '"at:///%');`

`select count(*) from (select distinct repo from records_listblock where deleted=false and (split_part(jsonb_extract_path_text(content, 'subject'), '/', 3))='did:plc:bmjomljebcsuxolnygfgqtap');`

Count all records

`analyze records; select relname, reltuples::int from pg_class where relname like 'records';`

View errors

`select last_error, count(*) from repos where failed_attempts > 0 group by last_error;`

Restart errors

`update repos set failed_attempts=0, last_error='' where failed_attempts >0;`

# MONITORING

More verbose logging for queries, DEBUG1-DEBUG5
`set client_min_messages = 'DEBUG5';`

Take a look at slow queries
```
SELECT pid, age(clock_timestamp(), query_start), state, query
FROM pg_stat_activity
WHERE query != '<IDLE>' AND query NOT ILIKE '%pg_stat_activity%'
ORDER BY query_start asc;
```

Monitor index progress
`select * from pg_stat_progress_create_index;`

Explore new collection types

```
select * from records where collection not in (
  'app.bsky.actor.profile',
  'app.bsky.feed.generator',
  'app.bsky.feed.like',
  'app.bsky.feed.post',
  'app.bsky.feed.repost',
  'app.bsky.feed.threadgate',
  'app.bsky.graph.block',
  'app.bsky.graph.follow',
  'app.bsky.graph.listitem',
  'app.bsky.graph.list',
  'app.bsky.graph.listblock'
) limit 20;
```

Count listitems
`select count(*) from listitems where list='at://did:plc:2yqylcqgxier4l5uplp6w6jh/app.bsky.graph.list/3kkud7l6s4v2m';`
@ -1,29 +0,0 @@
{
  "$type": "app.bsky.actor.profile",
  "avatar": {
    "ref": {
      "/": "bafkreihcxxwlssxseaxa2dclcci3l6qpnhy25igjqmqhig44iddlxneymm"
    },
    "size": 169608,
    "$type": "blob",
    "mimeType": "image/jpeg"
  },
  "banner": {
    "ref": {
      "/": "bafkreiejgeq5mo4kxx5s4t3jpmxxr3kirdgv7ozkvfm4hfh3p7eaow6xyu"
    },
    "size": 272387,
    "$type": "blob",
    "mimeType": "image/jpeg"
  },
  "labels": {
    "$type": "com.atproto.label.defs#selfLabels",
    "values": [
      {
        "val": "!no-unauthenticated"
      }
    ]
  },
  "description": "Full time parent, part time gamer. [He/Him]\n\nNeurodivergent [AuDHD] Demi 🦜\n\nSci-fi/Fantasy, video games, music, cats, anime, horror, crows, clothing, and human cognition are my jam. Not the extent of my interests - just what I'm willing to admit 😁 [NSFW]",
  "displayName": "Flux 🤍🩶💜🖤"
}
@ -1,45 +0,0 @@
{
  "did": "did:web:skyfeed.me",
  "$type": "app.bsky.feed.generator",
  "createdAt": "2024-02-11T18:10:26.365Z",
  "description": "絵描きさんと繋がりたい\n創作クラスタさんと繋がりたい\nクラスタフォロー\nの単語が含まれているPostのフィードです",
  "displayName": "絵描きさん探し",
  "skyfeedBuilder": {
    "blocks": [
      {
        "id": "aaajsgtnqrcm6",
        "did": "did:plc:l425td4tg5lq7y5gsrvfyhp5",
        "type": "input",
        "inputType": "firehose",
        "firehoseSeconds": 604800
      },
      {
        "id": "aaajsgtnqqgya",
        "type": "remove",
        "subject": "language",
        "language": "ja"
      },
      {
        "id": "aaajsgtnqqobo",
        "type": "regex",
        "value": "絵描きさんと繋がりたい|創作クラスタさんと繋がりたい|クラスタフォロー",
        "target": "text|alt_text",
        "caseSensitive": false
      },
      {
        "id": "aaajsrd2o422c",
        "type": "remove",
        "value": "0",
        "subject": "image_count"
      },
      {
        "id": "aaajsgtnqsjne",
        "type": "sort",
        "sortType": "created_at",
        "sortDirection": "desc"
      }
    ],
    "license": "EUPL-1.2",
    "displayName": "絵描きさん探し"
  }
}
@ -1,8 +0,0 @@
{
  "$type": "app.bsky.feed.like",
  "subject": {
    "cid": "bafyreiacuywksad5m72btsueyedsirbfamtovqdfdof2ulg2io7oofziv4",
    "uri": "at://did:plc:iq5uninsn3ovpycv7rkth3ik/app.bsky.feed.post/3kjbob2uwra25"
  },
  "createdAt": "2024-02-06T15:23:11.641Z"
}
@ -1,18 +0,0 @@
{
  "text": "Чесно кажучи це один з найкращих епізодів, на ряду з безсмертним другом",
  "$type": "app.bsky.feed.post",
  "langs": [
    "uk"
  ],
  "reply": {
    "root": {
      "cid": "bafyreienbpdlpqqwcovc56lgao2botzjleuqjocitapq5f2eficz2j2hdy",
      "uri": "at://did:plc:wymxmgvtvzuumvldtnz76aez/app.bsky.feed.post/3kjr2i3gsl22v"
    },
    "parent": {
      "cid": "bafyreienbpdlpqqwcovc56lgao2botzjleuqjocitapq5f2eficz2j2hdy",
      "uri": "at://did:plc:wymxmgvtvzuumvldtnz76aez/app.bsky.feed.post/3kjr2i3gsl22v"
    }
  },
  "createdAt": "2024-01-24T23:37:39.813Z"
}
@ -1,8 +0,0 @@
{
  "$type": "app.bsky.feed.repost",
  "subject": {
    "cid": "bafyreiglj4rlihlxraqqr7wvea2zybrk3ddugwk42qwiemwrytvnchc4hy",
    "uri": "at://did:plc:zvouh5woyfppe4gp6er354dl/app.bsky.feed.post/3kj3gjrmn7r2o"
  },
  "createdAt": "2024-01-16T12:28:45.555Z"
}
@ -1,6 +0,0 @@
{
  "post": "at://did:plc:gfrzrhrhzfrocrqnnutnuhk4/app.bsky.feed.post/3kgbzbuxh462c",
  "$type": "app.bsky.feed.threadgate",
  "allow": [],
  "createdAt": "2023-12-11T18:07:47.314Z"
}
@ -1,5 +0,0 @@
{
  "$type": "app.bsky.graph.block",
  "subject": "did:plc:yp6otbdle4znllf2wxf5vrzx",
  "createdAt": "2023-11-16T17:20:56.410Z"
}
@ -1,5 +0,0 @@
{
  "$type": "app.bsky.graph.follow",
  "subject": "did:plc:cwcgqihgua35pkw6j4iqvv7o",
  "createdAt": "2023-09-19T13:31:21.477Z"
}
@ -1,24 +0,0 @@
[
  {
    "name": "絵師",
    "$type": "app.bsky.graph.list",
    "purpose": "app.bsky.graph.defs#curatelist",
    "createdAt": "2024-02-07T03:28:14.317Z",
    "description": ""
  },
  {
    "name": "AI Art Bros",
    "$type": "app.bsky.graph.list",
    "avatar": {
      "ref": {
        "/": "bafkreicsl3dkam2uswcmck3j7xt7nvqwuxnpwydnkqeu5ncsgf22er46iu"
      },
      "size": 58526,
      "$type": "blob",
      "mimeType": "image/jpeg"
    },
    "purpose": "app.bsky.graph.defs#modlist",
    "createdAt": "2024-02-07T21:29:36.219Z",
    "description": "A list of ai art makers. We like to put our soul in our art. Get outta here! This list will expand overtime as I catch em >:D"
  }
]
@ -1,5 +0,0 @@
{
  "$type": "app.bsky.graph.listblock",
  "subject": "at://did:plc:aobkgz6khzavtdmd5ng3ilme/app.bsky.graph.list/3k6xopgz3xc23",
  "createdAt": "2024-01-05T23:19:05.067Z"
}
@ -1,6 +0,0 @@
{
  "list": "at://did:plc:nqouwdgddza2z3vwlxs73t4x/app.bsky.graph.list/3kksd6ja5622n",
  "$type": "app.bsky.graph.listitem",
  "subject": "did:plc:rd5rkwppbfxlfegsyddk24oz",
  "createdAt": "2024-02-07T03:28:23.842Z"
}
@ -1,22 +0,0 @@
.PHONY: all build up update down

# ---------------------------- Docker ----------------------------

all:
	go test -v ./...

.env:
	@cp example.env .env
	@echo "Please edit .env to suit your environment before proceeding"
	@exit 1

build: .env
	@docker compose build

up: .env
	@docker compose up -d --build

update: up

down:
	@docker compose down
@ -1,17 +0,0 @@
# To start prometheus + grafana

`cd metrics`

`docker compose up -d --build`

### Note: remember to allow ports for Prometheus to see host.docker.internal:xxxx from within the container

Lister, consumer, indexer
`sudo ufw allow 11001`
`sudo ufw allow 11002`
`sudo ufw allow 11003`

Postgres
`sudo ufw allow 15432`

# Go to `metrics/prometheus/exporters` and install node and query exporters
@ -1,25 +0,0 @@
version: '3.8'
services:
  prometheus:
    image: prom/prometheus
    # needed if mounted in custom volume
    user: root
    volumes:
      - "./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml"
      - "${PROMETHEUS_DATA_DIR:?specify data dir in .env file}:/prometheus"
    restart: always
    extra_hosts:
      - "host.docker.internal:host-gateway"
    ports:
      - 9090:9090
  grafana:
    build:
      context: ./grafana
    user: root
    restart: always
    extra_hosts:
      - "host.docker.internal:host-gateway"
    ports:
      - 9000:3000
    volumes:
      - ${GRAFANA_DATA_DIR:?specify data dir in .env file}:/var/lib/grafana
@ -1,17 +0,0 @@
FROM grafana/grafana:latest

# Disable Login form or not
ENV GF_AUTH_DISABLE_LOGIN_FORM "true"
# Allow anonymous authentication or not
ENV GF_AUTH_ANONYMOUS_ENABLED "true"
# Role of anonymous user
ENV GF_AUTH_ANONYMOUS_ORG_ROLE "Admin"
# Install plugins here or in your own config file
# ENV GF_INSTALL_PLUGINS="<list of plugins separated by ,>"

# Add provisioning
ADD ./provisioning /etc/grafana/provisioning
# Add configuration file
ADD ./grafana.ini /etc/grafana/grafana.ini
# Add dashboard json files
ADD ./dashboards /etc/grafana/dashboards
File diff suppressed because it is too large (3 files)
@ -1,23 +0,0 @@
[paths]
provisioning = /etc/grafana/provisioning

[server]
enable_gzip = true
# To add HTTPS support:
#protocol = https
#;http_addr =
#http_port = 3000
#domain = localhost
#enforce_domain = false
#root_url = https://localhost:3000
#router_logging = false
#static_root_path = public
#cert_file = /etc/certs/cert.pem
#cert_key = /etc/certs/cert-key.pem

[security]
# If you want to embed grafana into an iframe for example
allow_embedding = true

[users]
default_theme = dark
@ -1,14 +0,0 @@
apiVersion: 1
contactPoints:
  - orgId: 1
    name: Uabluerail Discord
    receivers:
      - uid: edg8hyxtl9s74f
        type: discord
        settings:
          avatar_url: https://cdn.bsky.app/img/avatar/plain/did:plc:ohvstchboonnmbplvwkl33ko/bafkreibyw6gw5ix6p7uerwurrmimrc3nfxwdba3ainto36kjv3ywhdkjdq@jpeg
          # message: message template
          # title: title template
          url: https://discord.com/api/webhooks/1203054943578226709/lt1thL_pKzfG9fgA7reslqV1iaq9L2uYFxRIBJzxot8GAF1NicvWYHEOeMGKeQQOeOB9
          use_discord_username: false
        disableResolveMessage: false
@ -1,25 +0,0 @@
# config file version
apiVersion: 1

providers:
  # <string> an unique provider name
  - name: My Dashboard
    # <int> org id. will default to orgId 1 if not specified
    org_id: 1
    # <string, required> name of the dashboard folder. Required
    folder: ''
    # <string, required> provider type. Required
    type: 'file'
    # <bool> disable dashboard deletion
    disableDeletion: false
    # <bool> enable dashboard editing
    editable: true
    # <int> how often Grafana will scan for changed dashboards
    updateIntervalSeconds: 5
    # <bool> allow updating provisioned dashboards from the UI
    allowUiUpdates: true
    options:
      # <string, required> path to dashboard files on disk. Required
      path: /etc/grafana/dashboards
      # <bool> use folder names from filesystem to create folders in Grafana
      foldersFromFilesStructure: true
@ -1,25 +0,0 @@
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    # Access mode - proxy (server in the UI) or direct (browser in the UI).
    url: http://prometheus:9090
    jsonData:
      httpMethod: POST
      manageAlerts: true
      prometheusType: Prometheus
      prometheusVersion: 2.49.0
      cacheLevel: 'High'
      disableRecordingRules: false
      incrementalQueryOverlapWindow: 10m
      exemplarTraceIdDestinations:
        # Field with internal link pointing to data source in Grafana.
        # datasourceUid value can be anything, but it should be unique across all defined data source uids.
        - datasourceUid: 000000001
          name: traceID

        # Field with external link.
        - name: traceID
          url: 'http://host.docker.internal:3000/explore?orgId=1&left=%5B%22now-1h%22,%22now%22,%22Jaeger%22,%7B%22query%22:%22$${__value.raw}%22%7D%5D'
@ -1,136 +0,0 @@
# Install Node-exporter

You'll need to install node exporter for monitoring.

1. Download Node Exporter
As a first step, download the Node Exporter binary, which is available for Linux on the official Prometheus website. There you will find a table with the list of available builds; the one of interest here is the node_exporter build for Linux AMD64.

Node Exporter Ubuntu Linux

In this case the latest available version is 1.7.0. Copy the .tar.gz URL and download it somewhere on your server using wget or cURL:

`wget https://github.com/prometheus/node_exporter/releases/download/v1.7.0/node_exporter-1.7.0.linux-amd64.tar.gz`

2. Extract Node Exporter and move the binary
After downloading the latest version of Node Exporter, extract the content of the downloaded tar with the following command:

`tar xvf node_exporter-1.7.0.linux-amd64.tar.gz`

The content is extracted into the current directory; the extracted directory contains 3 files:

- LICENSE (license text file)
- node_exporter (binary)
- NOTICE (license text file)

You only need to move the node_exporter binary to the /usr/local/bin directory of your system. Switch to the node_exporter directory:

`cd node_exporter-1.7.0.linux-amd64`

And then copy the binary file with the following command:

`sudo cp node_exporter /usr/local/bin`

Then you can remove the directory that was created when extracting the tar file content:

# Exit current directory
`cd ..`

# Remove the extracted directory
`rm -rf ./node_exporter-1.7.0.linux-amd64`

3. Create Node Exporter user
As a good practice, create a user in the system for Node Exporter:

`sudo useradd --no-create-home --shell /bin/false node_exporter`

And set the owner of the node_exporter binary to the newly created user:

`sudo chown node_exporter:node_exporter /usr/local/bin/node_exporter`

4. Create and start the Node Exporter service
The Node Exporter service should always start when the server boots, so it will always be available to be scraped for information. Create the node_exporter.service file with nano:

`sudo nano /etc/systemd/system/node_exporter.service`

And paste the following content into the file:

```
[Unit]
Description=Node Exporter
Wants=network-online.target
After=network-online.target

[Service]
User=node_exporter
Group=node_exporter
Type=simple
ExecStart=/usr/local/bin/node_exporter --collector.textfile.directory=/var/lib/prometheus/node-exporter/
Restart=always
RestartSec=3

[Install]
WantedBy=multi-user.target
```

Close nano and save the changes to the file. Reload the daemon with:

`sudo systemctl daemon-reload`

Then enable the node_exporter service with the following command:

`sudo systemctl enable node_exporter`

And then start the service:

`sudo systemctl start node_exporter`

`sudo ufw allow 9090`
`sudo ufw allow 9100`

Now go to `http://localhost:9100/metrics`

# Install query exporter

Query-exporter is started in the indexer's docker.

To allow viewing it on the local network:

`cd exporters`

`sudo ufw allow 9560`

# Install smartmon

`sudo apt install prometheus-node-exporter-collectors smartmontools`

Check if your SSD is compatible (your device name may differ, mine is /dev/sda)
`sudo smartctl -i /dev/sda`

Enable SMART on your SSD
`sudo smartctl -s on /dev/sda`

Check smartmon is configured correctly
`sudo nano /lib/systemd/system/prometheus-node-exporter-smartmon.service`

It should look like this:
```
[Unit]
Description=Collect SMART metrics for prometheus-node-exporter

[Service]
Type=oneshot
Environment=TMPDIR=/var/lib/prometheus/node-exporter
ExecStart=/bin/bash -c "/usr/share/prometheus-node-exporter-collectors/smartmon.sh | sponge /var/lib/prometheus/node-exporter/smartmon.prom"
```

Start the service
`systemctl start prometheus-node-exporter-smartmon.service`

Open the node exporter service
`sudo nano /etc/systemd/system/node_exporter.service`

Check that it contains the `--collector.textfile.directory` parameter
```
ExecStart=/usr/local/bin/node_exporter --collector.textfile.directory=/var/lib/prometheus/node-exporter/
```

Start the smartmon service
`systemctl start prometheus-node-exporter-smartmon.service`

Check that the file contains correct metrics
`nano /var/lib/prometheus/node-exporter/smartmon.prom`

Restart the node exporter
`sudo systemctl restart node_exporter`

Metrics should display on `http://localhost:9100/metrics`
@ -1,71 +0,0 @@
databases:
  db1:
    dsn: env:POSTGRES_URL

metrics:
  repos_fully_indexed:
    type: gauge
    description: Repositories fully indexed
  repos_seen:
    type: gauge
    description: Repositories seen
  repos_failed:
    type: gauge
    description: Repositories that we failed to index
  consumer_bad_records:
    type: gauge
    description: Records received from firehose that we failed to process
    labels: [pds, error]
  # posts_lang:
  #   type: summary
  #   description: Posts by language
  #   labels: [uk, lt, et, lv, pl, ga, fi, sv,
  #            en, jp, de, fr, pt, es, nl, ko, tr, zh, ru]

queries:
  query1:
    interval: 30
    databases: [db1]
    metrics: [repos_fully_indexed]
    sql: >
      select count(*) as repos_fully_indexed
      from repos left join pds on repos.pds = pds.id
      where failed_attempts < 3
        and last_indexed_rev <> ''
        and (last_indexed_rev >= first_rev_since_reset
          or first_rev_since_reset is null or first_rev_since_reset = '')
        and (repos.first_cursor_since_reset >= pds.first_cursor_since_reset
          or repos.first_cursor_since_reset is null or repos.first_cursor_since_reset = 0);
  query2:
    interval: 30
    databases: [db1]
    metrics: [repos_seen]
    sql: select count(*) as repos_seen from repos;
  query3:
    interval: 30
    databases: [db1]
    metrics: [repos_failed]
    sql: select count(*) as repos_failed from repos where failed_attempts >= 3;
  # query4:
  #   interval: 300
  #   databases: [db1]
  #   metrics: [posts_lang]
  #   sql: select count(*) as uk from records where collection in ('app.bsky.feed.post') and content::text like '%"langs": ["uk"]%';
  bad_records:
    interval: 30
    databases: [db1]
    metrics: [consumer_bad_records]
    sql: |
      select count(*) as consumer_bad_records, host as pds, error
      from (
        select id, created_at, pds, cursor, content,
          regexp_replace(regexp_replace(regexp_replace(error,
            'did:[\:a-z0-9]+', 'did:xxx', 'g'),
            'json\.RawMessage\{[^}]+\}', 'json.RawMessage{...}', 'g'),
            '[0-9]{1,3}(\.[0-9]{1,3}){3}\:[0-9]+', '<IP>\:<port>', 'g') as error
        from bad_records
      )
      join
        pds
      on pds=pds.id
      group by error, host;
@ -1 +0,0 @@
POSTGRES_PASSWORD='your password'
@ -1,16 +0,0 @@
global:
  scrape_interval: 10s
scrape_configs:
  - job_name: prometheus
    static_configs:
      - targets:
          - host.docker.internal:9090
  - job_name: node
    static_configs:
      - targets: ['host.docker.internal:9100']
  - job_name: indexer
    static_configs:
      - targets: [ host.docker.internal:11001, host.docker.internal:11002, host.docker.internal:11003 ]
  - job_name: db
    static_configs:
      - targets: ['host.docker.internal:9560']
pds/pds.go
@ -1,56 +0,0 @@
package pds

import (
	"context"
	"fmt"
	"path/filepath"
	"time"

	"gorm.io/gorm"

	"github.com/uabluerail/indexer/models"
)

const Unknown models.ID = 0

var whitelist []string = []string{
	"https://bsky.social",
	"https://*.bsky.network",
	"https://*",
}

type PDS struct {
	ID                    models.ID `gorm:"primarykey"`
	CreatedAt             time.Time
	UpdatedAt             time.Time
	Host                  string `gorm:"uniqueIndex"`
	Cursor                int64
	FirstCursorSinceReset int64
	LastList              time.Time
	CrawlLimit            int
	Disabled              bool
}

func AutoMigrate(db *gorm.DB) error {
	return db.AutoMigrate(&PDS{})
}

func EnsureExists(ctx context.Context, db *gorm.DB, host string) (*PDS, error) {
	if !IsWhitelisted(host) {
		return nil, fmt.Errorf("host %q is not whitelisted", host)
	}
	remote := PDS{Host: host}
	if err := db.Model(&remote).Where(&PDS{Host: host}).FirstOrCreate(&remote).Error; err != nil {
		return nil, fmt.Errorf("failed to get PDS record from DB for %q: %w", remote.Host, err)
	}
	return &remote, nil
}

func IsWhitelisted(host string) bool {
	for _, p := range whitelist {
		if match, _ := filepath.Match(p, host); match {
			return true
		}
	}
	return false
}
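A minimal, self-contained sketch (not from this repository) of how the whitelist patterns above behave under `filepath.Match`: `*` does not cross a `/`, so a trailing path on a PDS URL would defeat even the catch-all `https://*` entry. The host names below are made up for illustration.

```
package main

import (
	"fmt"
	"path/filepath"
)

func main() {
	// Same patterns as the whitelist in pds/pds.go above.
	patterns := []string{"https://bsky.social", "https://*.bsky.network", "https://*"}
	hosts := []string{
		"https://bsky.social",
		"https://morel.us-east.host.bsky.network", // hypothetical PDS host
		"https://pds.example.com",                 // hypothetical third-party PDS
		"https://pds.example.com/",                // trailing slash defeats the "https://*" glob
	}
	for _, h := range hosts {
		matched := false
		for _, p := range patterns {
			// filepath.Match: '*' matches any sequence of non-'/' characters.
			if ok, _ := filepath.Match(p, h); ok {
				matched = true
				break
			}
		}
		fmt.Printf("%-45s whitelisted=%v\n", h, matched)
	}
}
```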
repo/mst.go
@ -1,352 +0,0 @@
package repo

import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"time"

	"github.com/rs/zerolog"

	"github.com/ipfs/go-cid"
	"github.com/ipld/go-car"
	"github.com/ipld/go-ipld-prime/codec/dagcbor"
	"github.com/ipld/go-ipld-prime/codec/dagjson"
	"github.com/ipld/go-ipld-prime/datamodel"
	"github.com/ipld/go-ipld-prime/node/basicnode"
)

var ErrInvalidSignature = fmt.Errorf("commit signature is not valid")

func ExtractRecords(ctx context.Context, b io.Reader, signingKey string) (map[string]json.RawMessage, error) {
	log := zerolog.Ctx(ctx)

	r, err := car.NewCarReader(b)
	if err != nil {
		return nil, fmt.Errorf("failed to construct CAR reader: %w", err)
	}

	blocks := map[cid.Cid][]byte{}
	for {
		block, err := r.Next()
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			return nil, fmt.Errorf("reading next block: %w", err)
		}
		c, err := block.Cid().Prefix().Sum(block.RawData())
		if err != nil {
			return nil, fmt.Errorf("failed to calculate CID from content")
		}
		if c.Equals(block.Cid()) {
			blocks[block.Cid()] = block.RawData()
		} else {
			log.Debug().Str("cid", block.Cid().String()).
				Msgf("CID doesn't match block content: %s != %s", block.Cid().String(), c.String())
		}
	}

	records := map[string]cid.Cid{}
	if len(r.Header.Roots) == 0 {
		return nil, fmt.Errorf("CAR has zero roots specified")
	}

	// https://atproto.com/specs/repository specifies that the first root
	// must be a commit object. Meaning of subsequent roots is not yet defined.
	root := r.Header.Roots[0]

	// TODO: verify that a root is a commit record and validate signature
	if _, found := blocks[root]; !found {
		return nil, fmt.Errorf("root block is missing")
	}
	valid, err := verifyCommitSignature(ctx, blocks[root], signingKey)
	if err != nil {
		return nil, fmt.Errorf("commit signature verification failed: %w", err)
	}
	if !valid {
		return nil, ErrInvalidSignature
	}

	cids, err := findRecords(blocks, root, nil, nil, 0)
	if err != nil {
		return nil, err
	}
	for k, v := range cids {
		records[k] = v
	}

	res := map[string]json.RawMessage{}
	for k, c := range records {
		builder := basicnode.Prototype.Any.NewBuilder()
		if err := (&dagcbor.DecodeOptions{AllowLinks: true}).Decode(builder, bytes.NewReader(blocks[c])); err != nil {
			return nil, fmt.Errorf("unmarshaling %q: %w", c.String(), err)
		}
		w := bytes.NewBuffer(nil)
		if err := (dagjson.EncodeOptions{EncodeLinks: true, EncodeBytes: true}).Encode(builder.Build(), w); err != nil {
			return nil, fmt.Errorf("marshaling %q as JSON: %w", c.String(), err)
		}
		res[k] = w.Bytes()
	}
	return res, nil
}

const maxDepth = 128

func findRecords(blocks map[cid.Cid][]byte, root cid.Cid, key []byte, visited map[cid.Cid]bool, depth int) (map[string]cid.Cid, error) {
	if depth > maxDepth {
		return nil, fmt.Errorf("reached maximum depth at %q", root.String())
	}

	if visited == nil {
		visited = map[cid.Cid]bool{}
	}

	visited[root] = true

	builder := basicnode.Prototype.Any.NewBuilder()
	if err := (&dagcbor.DecodeOptions{AllowLinks: true}).Decode(builder, bytes.NewReader(blocks[root])); err != nil {
		return nil, fmt.Errorf("unmarshaling %q: %w", root.String(), err)
	}
	node := builder.Build()

	if node.Kind() != datamodel.Kind_Map {
		return nil, nil
	}

	m, err := parseMap(node)
	if err != nil {
		return nil, err
	}

	if _, ok := m["$type"]; ok {
		return map[string]cid.Cid{string(key): root}, nil
	}

	if d, ok := m["data"]; ok {
		// Commit record
		if d.Kind() == datamodel.Kind_Link {
			l, _ := d.AsLink()
			if l != nil {
				c, err := cid.Parse([]byte(l.Binary()))
				if err != nil {
					return nil, fmt.Errorf("failed to parse %q as CID: %w", l.String(), err)
				}
				if _, ok := blocks[c]; ok && !visited[c] {
					return findRecords(blocks, c, nil, visited, depth+1)
				}
			}
		}
		return nil, nil
	}

	if entries, ok := m["e"]; ok {
		// MST node
		r := map[string]cid.Cid{}
		iter := entries.ListIterator()
		key = []byte{}
		for !iter.Done() {
			_, item, err := iter.Next()
			if err != nil {
				return nil, fmt.Errorf("failed to read the next list item in block %q: %w", root.String(), err)
			}
			if item.Kind() != datamodel.Kind_Map {
				continue
			}

			m, err := parseMap(item)
			if err != nil {
				return nil, err
			}

			for _, field := range []string{"k", "p", "v", "t"} {
				if _, ok := m[field]; !ok {
					return nil, fmt.Errorf("TreeEntry is missing field %q", field)
				}
			}
			prefixLen, err := m["p"].AsInt()
			if err != nil {
				return nil, fmt.Errorf("m[\"p\"].AsInt(): %w", err)
			}
			prefixPart, err := m["k"].AsBytes()
			if err != nil {
				return nil, fmt.Errorf("m[\"k\"].AsBytes(): %w", err)
			}
			val, err := m["v"].AsLink()
			if err != nil {
				return nil, fmt.Errorf("m[\"v\"].AsLink(): %w", err)
			}
			c, err := cid.Parse([]byte(val.Binary()))
			if err != nil {
				return nil, fmt.Errorf("failed to parse %q as CID: %w", val.String(), err)
			}

			if len(key) == 0 {
				// First entry, must have a full key.
				if prefixLen != 0 {
					return nil, fmt.Errorf("incomplete key in the first entry")
				}
				key = prefixPart
			}

			if prefixLen > int64(len(key)) {
				return nil, fmt.Errorf("specified prefix length is larger than the key length: %d > %d", prefixLen, len(key))
			}
			key = append(key[:prefixLen], prefixPart...)

			if _, ok := blocks[c]; ok && !visited[c] {
				results, err := findRecords(blocks, c, key, visited, depth+1)
				if err != nil {
					return nil, err
				}
				for k, v := range results {
					r[k] = v
				}
			}

			if m["t"] != nil && m["t"].Kind() == datamodel.Kind_Link {
				subtree, err := m["t"].AsLink()
				if err != nil {
					return nil, fmt.Errorf("m[\"t\"].AsLink(): %w", err)
				}
				subtreeCid, err := cid.Parse([]byte(subtree.Binary()))
				if err != nil {
					return nil, fmt.Errorf("failed to parse %q as CID: %w", val.String(), err)
				}
				if _, ok := blocks[subtreeCid]; ok && !visited[subtreeCid] {
					results, err := findRecords(blocks, subtreeCid, key, visited, depth+1)
					if err != nil {
						return nil, err
					}
					for k, v := range results {
						r[k] = v
					}
				}
			}
		}

		left, ok := m["l"]
		if ok && left.Kind() == datamodel.Kind_Link {
			l, _ := left.AsLink()
			if l != nil {
				c, err := cid.Parse([]byte(l.Binary()))
				if err != nil {
					return nil, fmt.Errorf("failed to parse %q as CID: %w", l.String(), err)
				}
				if _, ok := blocks[c]; ok && !visited[c] {
					results, err := findRecords(blocks, c, nil, visited, depth+1)
					if err != nil {
						return nil, err
					}
					for k, v := range results {
						r[k] = v
					}
				}
			}
		}

		return r, nil
	}

	return nil, fmt.Errorf("unrecognized block %q", root.String())
}

func parseMap(node datamodel.Node) (map[string]datamodel.Node, error) {
	if node.Kind() != datamodel.Kind_Map {
		return nil, fmt.Errorf("not a map")
	}

	m := map[string]datamodel.Node{}
	iter := node.MapIterator()
	for !iter.Done() {
		k, v, err := iter.Next()
		if err != nil {
			return nil, fmt.Errorf("iterating over map fields: %w", err)
		}
		if k.Kind() != datamodel.Kind_String {
			continue
		}
		ks, _ := k.AsString()
		m[ks] = v
	}
	return m, nil
}

var ErrZeroBlocks = fmt.Errorf("zero blocks found")

func GetRev(ctx context.Context, b io.Reader) (string, error) {
	r, err := car.NewCarReader(b)
	if err != nil {
		return "", fmt.Errorf("failed to construct CAR reader: %w", err)
	}

	if len(r.Header.Roots) == 0 {
		return "", fmt.Errorf("no roots specified in CAR header")
	}

	blocks := map[cid.Cid][]byte{}
	for {
		block, err := r.Next()
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			return "", fmt.Errorf("reading next block: %w", err)
		}
		c, err := block.Cid().Prefix().Sum(block.RawData())
		if err != nil {
			return "", fmt.Errorf("failed to calculate CID from content")
		}
		if c.Equals(block.Cid()) {
			blocks[block.Cid()] = block.RawData()
		}
	}

	if len(blocks) == 0 {
		return "", ErrZeroBlocks
	}

	builder := basicnode.Prototype.Any.NewBuilder()
	if err := (&dagcbor.DecodeOptions{AllowLinks: true}).Decode(builder, bytes.NewReader(blocks[r.Header.Roots[0]])); err != nil {
		return "", fmt.Errorf("unmarshaling %q: %w", r.Header.Roots[0].String(), err)
	}
	node := builder.Build()

	v, err := node.LookupByString("rev")
	if err != nil {
		return "", fmt.Errorf("looking up 'rev' field: %w", err)
	}

	s, err := v.AsString()
	if err != nil {
		return "", fmt.Errorf("rev.AsString(): %w", err)
	}
	return s, nil
}

func GetLang(ctx context.Context, value json.RawMessage) ([]string, time.Time, error) {
	var content struct {
		Type  string   `json:"$type"`
		Langs []string `json:"langs"`
		Time  string   `json:"createdAt"`
	}
	err := json.Unmarshal([]byte(value), &content)

	if err != nil {
		return nil, time.Now(), fmt.Errorf("failed to extract lang from content: %w", err)
	}
	if content.Type != "app.bsky.feed.post" {
		return nil, time.Now(), errors.New("not a post")
	}

	var timestamp time.Time
	if t, err := time.Parse(time.RFC3339, content.Time); err != nil {
		return nil, time.Now(), fmt.Errorf("failed to extract time from content: %w", err)
	} else {
		timestamp = t
	}
	return content.Langs, timestamp, nil
}
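A hedged usage sketch (not from this repository) of the removed `ExtractRecords` helper above: it reads a repo CAR export and returns records keyed by their MST key. The file name and signing key below are placeholders; with a placeholder key the built-in signature check will fail, so a real multibase-encoded public key from the account's DID document is needed for this to actually succeed.

```
package main

import (
	"context"
	"fmt"
	"log"
	"os"

	"github.com/uabluerail/indexer/repo"
)

func main() {
	// Placeholder file name: e.g. the output of com.atproto.sync.getRepo.
	f, err := os.Open("repo.car")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// Placeholder key; parseSigningKey expects the multibase portion of the
	// account's did:key (the "did:key:" prefix already stripped).
	signingKey := "zQ3sh..."

	records, err := repo.ExtractRecords(context.Background(), f, signingKey)
	if err != nil {
		log.Fatal(err)
	}
	for key, value := range records {
		// key is the MST key, value is the record re-encoded as DAG-JSON.
		fmt.Printf("%s: %d bytes\n", key, len(value))
	}
}
```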
repo/repo.go
@ -1,105 +0,0 @@
package repo

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"time"

	"gorm.io/gorm"

	"github.com/uabluerail/indexer/models"
	"github.com/uabluerail/indexer/pds"
	"github.com/uabluerail/indexer/util/resolver"
)

type Repo struct {
	ID                    models.ID `gorm:"primarykey"`
	CreatedAt             time.Time
	UpdatedAt             time.Time
	PDS                   models.ID `gorm:"default:0;index:rev_state_index,priority:2;index:was_indexed,priority:2"`
	DID                   string    `gorm:"uniqueIndex;column:did"`
	LastIndexedRev        string    `gorm:"index:rev_state_index,expression:(last_indexed_rev < first_rev_since_reset),priority:1;index:was_indexed,expression:(last_indexed_rev is null OR last_indexed_rev = ''),priority:1"`
	FirstRevSinceReset    string
	LastFirehoseRev       string
	FirstCursorSinceReset int64
	TombstonedAt          time.Time
	LastIndexAttempt      time.Time
	LastError             string
	FailedAttempts        int `gorm:"default:0"`
	LastKnownKey          string
}

type Record struct {
	ID         models.ID
	CreatedAt  time.Time       `gorm:"not null"`
	UpdatedAt  time.Time       `gorm:"autoUpdateTime:false"`
	Repo       models.ID       `gorm:"index:idx_repo_record_key,priority:1;not null;index:idx_repo_rev"`
	Collection string          `gorm:"index:idx_repo_record_key,priority:2;not null"`
	Rkey       string          `gorm:"index:idx_repo_record_key,priority:3"`
	AtRev      string          `gorm:"index:idx_repo_rev"`
	Content    json.RawMessage `gorm:"type:JSONB"`
	Deleted    bool            `gorm:"default:false"`
}

func AutoMigrate(db *gorm.DB) error {
	return db.AutoMigrate(&Repo{}, &Record{})
}

func EnsureExists(ctx context.Context, db *gorm.DB, did string) (*Repo, bool, error) {
	r := Repo{}
	if err := db.Model(&r).Where(&Repo{DID: did}).Take(&r).Error; err == nil {
		// Already have a row, just return it.
		return &r, false, nil
	} else {
		if !errors.Is(err, gorm.ErrRecordNotFound) {
			return nil, false, fmt.Errorf("querying DB: %w", err)
		}
	}

	// No row yet, so we need to create one (keeping in mind that it can be created
	// concurrently by someone else).
	// 1) resolve did (i.e., query PLC)
	// 2) get PDS address from didDoc and ensure we have a record for it
	// 3) in a transaction, check if we have a record for the repo
	//    if we don't - just create a record
	//    if we do - compare PDS IDs
	//       if they don't match - also reset FirstRevSinceReset

	u, pubKey, err := resolver.GetPDSEndpointAndPublicKey(ctx, did)
	if err != nil {
		return nil, false, fmt.Errorf("fetching DID Document: %w", err)
	}
	if u.Path == "/" {
		// Discard insignificant path to avoid string comparison mismatches,
		// as well as glob pattern false negatives.
		u.Path = ""
	}

	remote, err := pds.EnsureExists(ctx, db, u.String())
	if err != nil {
		return nil, false, fmt.Errorf("failed to get PDS record from DB for %q: %w", u.String(), err)
	}
	r = Repo{
		DID:          did,
		PDS:          models.ID(remote.ID),
		LastKnownKey: pubKey,
	}
	created := false
	err = db.Transaction(func(tx *gorm.DB) error {
		result := tx.Model(&r).Where(&Repo{DID: r.DID}).FirstOrCreate(&r)
		if err := result.Error; err != nil {
			return fmt.Errorf("looking for repo: %w", err)
		}
		if r.PDS != models.ID(remote.ID) {
			return tx.Model(&r).Select("FirstRevSinceReset").Updates(&Repo{FirstRevSinceReset: ""}).Error
		}
		created = result.RowsAffected > 0
		return nil
	})
	if err != nil {
		return nil, false, fmt.Errorf("upserting repo record: %w", err)
	}
	return &r, created, nil
}
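A hedged sketch (not from this repository) of wiring the `Repo`/`Record` schema above to Postgres with GORM and ensuring a row for a single DID. The DSN and the DID are placeholders, and `EnsureExists` resolves the DID over the network, so it needs connectivity to PLC and a resolvable `did:plc:`.

```
package main

import (
	"context"
	"log"

	"gorm.io/driver/postgres"
	"gorm.io/gorm"

	"github.com/uabluerail/indexer/repo"
)

func main() {
	// Placeholder DSN; matches the *_POSTGRES_URL style used in docker-compose.yml.
	dsn := "postgres://postgres:password@localhost:5432/bluesky?sslmode=disable"
	db, err := gorm.Open(postgres.Open(dsn), &gorm.Config{})
	if err != nil {
		log.Fatal(err)
	}

	// Create/upgrade the repos and records tables declared above.
	if err := repo.AutoMigrate(db); err != nil {
		log.Fatal(err)
	}

	// Placeholder DID; must resolve via PLC for the PDS lookup to succeed.
	r, created, err := repo.EnsureExists(context.Background(), db, "did:plc:example")
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("repo id=%v created=%v", r.ID, created)
}
```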
@ -1,223 +0,0 @@
package repo

import (
	"bytes"
	"context"
	"crypto"
	"crypto/ecdsa"
	"crypto/elliptic"
	"crypto/sha256"
	"encoding/binary"
	"fmt"
	"io"
	"math/big"

	"github.com/rs/zerolog"
	"gitlab.com/yawning/secp256k1-voi/secec"

	"github.com/ipfs/go-cid"
	"github.com/ipld/go-ipld-prime/codec/dagcbor"
	"github.com/ipld/go-ipld-prime/datamodel"
	"github.com/ipld/go-ipld-prime/node/basicnode"
	"github.com/multiformats/go-multibase"
	"github.com/multiformats/go-multicodec"
)

type SignatureValidator func(digest []byte, sig []byte) (bool, error)

func parseSigningKey(ctx context.Context, key string) (SignatureValidator, error) {
	log := zerolog.Ctx(ctx)

	// const didKey = "did:key:"

	// if !strings.HasPrefix(key, didKey) {
	// 	return nil, fmt.Errorf("expected the key %q to have prefix %q", key, didKey)
	// }

	// key = strings.TrimPrefix(key, didKey)
	enc, val, err := multibase.Decode(key)
	if err != nil {
		return nil, fmt.Errorf("failed to decode key data: %w", err)
	}

	if enc != multibase.Base58BTC {
		log.Info().Msgf("unexpected key encoding: %v", enc)
	}

	buf := bytes.NewBuffer(val)
	kind, err := binary.ReadUvarint(buf)
	if err != nil {
		return nil, fmt.Errorf("failed to parse key type: %w", err)
	}
	data, _ := io.ReadAll(buf)

	switch multicodec.Code(kind) {
	case multicodec.P256Pub:
		x, y := elliptic.UnmarshalCompressed(elliptic.P256(), data)
		return func(digest, sig []byte) (bool, error) {
			pk := &ecdsa.PublicKey{
				Curve: elliptic.P256(),
				X:     x,
				Y:     y,
			}

			if len(sig) != 64 {
				return false, fmt.Errorf("unexpected signature length: %d != 64", len(sig))
			}
			r := big.NewInt(0).SetBytes(sig[:32])
			s := big.NewInt(0).SetBytes(sig[32:])
			return ecdsa.Verify(pk, digest, r, s), nil
		}, nil
	case multicodec.Secp256k1Pub:
		pk, err := secec.NewPublicKey(data)
		if err != nil {
			return nil, fmt.Errorf("failed to parse secp256k public key: %w", err)
		}
		return func(digest, sig []byte) (bool, error) {
			return pk.Verify(digest, sig, &secec.ECDSAOptions{
				Hash:            crypto.SHA256,
				Encoding:        secec.EncodingCompact,
				RejectMalleable: true,
			}), nil
		}, nil
	default:
		return nil, fmt.Errorf("unsupported key type %q", multicodec.Code(kind))
	}
}

func verifyCommitSignature(ctx context.Context, data []byte, key string) (bool, error) {
	validateSignature, err := parseSigningKey(ctx, key)
	if err != nil {
		return false, fmt.Errorf("failed to parse the key: %w", err)
	}

	type Commit struct {
		DID     string
		Version int
		Data    cid.Cid
		Rev     string
		Prev    *cid.Cid
		Sig     []byte
	}

	builder := basicnode.Prototype.Any.NewBuilder()
	if err := (&dagcbor.DecodeOptions{AllowLinks: true}).Decode(builder, bytes.NewReader(data)); err != nil {
		return false, fmt.Errorf("unmarshaling commit: %w", err)
	}
	node := builder.Build()

	if node.Kind() != datamodel.Kind_Map {
		return false, fmt.Errorf("commit must be a Map, got %s instead", node.Kind())
	}

	m, err := parseMap(node)
	if err != nil {
		return false, err
	}

	commit := Commit{}

	if n, found := m["version"]; !found {
		return false, fmt.Errorf("missing \"version\"")
	} else {
		v, err := n.AsInt()
		if err != nil {
			return false, fmt.Errorf("failed to parse \"version\": %w", err)
		}
		commit.Version = int(v)
	}

	if n, found := m["did"]; !found {
		return false, fmt.Errorf("missing \"did\"")
	} else {
		v, err := n.AsString()
		if err != nil {
			return false, fmt.Errorf("failed to parse \"did\": %w", err)
		}
		commit.DID = v
	}

	if n, found := m["data"]; !found {
		return false, fmt.Errorf("missing \"data\"")
	} else {
		v, err := n.AsLink()
		if err != nil {
			return false, fmt.Errorf("failed to parse \"data\": %w", err)
		}
		c, err := cid.Parse([]byte(v.Binary()))
		if err != nil {
			return false, fmt.Errorf("failed to convert \"data\" to CID: %w", err)
		}
		commit.Data = c
	}

	if n, found := m["rev"]; !found {
		return false, fmt.Errorf("missing \"rev\"")
	} else {
		v, err := n.AsString()
		if err != nil {
			return false, fmt.Errorf("failed to parse \"rev\": %w", err)
		}
		commit.Rev = v
	}

	if n, found := m["prev"]; !found {
		return false, fmt.Errorf("missing \"prev\"")
	} else {
		if !n.IsNull() {
			v, err := n.AsLink()
			if err != nil {
				return false, fmt.Errorf("failed to parse \"prev\": %w", err)
			}
			c, err := cid.Parse([]byte(v.Binary()))
			if err != nil {
				return false, fmt.Errorf("failed to convert \"prev\" to CID: %w", err)
			}
			commit.Prev = &c
		}
	}

	if n, found := m["sig"]; !found {
		return false, fmt.Errorf("missing \"sig\"")
	} else {
		v, err := n.AsBytes()
		if err != nil {
			return false, fmt.Errorf("failed to parse \"sig\": %w", err)
		}
		commit.Sig = v
	}

	if commit.Version != 3 {
		return false, fmt.Errorf("unknown commit version %d", commit.Version)
	}

	unsignedBuilder := basicnode.Prototype.Map.NewBuilder()
	mb, err := unsignedBuilder.BeginMap(int64(len(m) - 1))
	if err != nil {
		return false, fmt.Errorf("initializing a map for unsigned commit: %w", err)
	}
	// XXX: signature validation depends on this specific order of keys in the map.
	for _, k := range []string{"did", "rev", "data", "prev", "version"} {
		if k == "sig" {
			continue
		}
		if err := mb.AssembleKey().AssignString(k); err != nil {
			return false, fmt.Errorf("failed to assemble key %q: %w", k, err)
		}
		if err := mb.AssembleValue().AssignNode(m[k]); err != nil {
			return false, fmt.Errorf("failed to assemble value for key %q: %w", k, err)
		}
	}
	if err := mb.Finish(); err != nil {
		return false, fmt.Errorf("failed to finalize the map: %w", err)
	}
	unsignedNode := unsignedBuilder.Build()

	buf := bytes.NewBuffer(nil)
	if err := (&dagcbor.EncodeOptions{AllowLinks: true}).Encode(unsignedNode, buf); err != nil {
		return false, fmt.Errorf("failed to serialize unsigned commit: %w", err)
	}
	unsignedBytes := buf.Bytes()
	unsignedHash := sha256.Sum256(unsignedBytes)
	return validateSignature(unsignedHash[:], commit.Sig)
}