Remove everything that's not needed for PLC mirror

main
Max Ignatenko 2024-09-25 22:00:00 +01:00
parent 56727bbe11
commit 9b877c1524
89 changed files with 12 additions and 38231 deletions

View File

@ -1,6 +1,4 @@
.PHONY: all build up update down start-db status logs psql init-db start-plc wait-for-plc
.PHONY: all build up update down start-db status logs
# ---------------------------- Docker ----------------------------
all:
go test -v ./...
@ -29,71 +27,3 @@ status:
logs:
@docker compose logs -f -n 50
start-plc: .env
@docker compose up -d --build postgres plc
wait-for-plc:
@. ./.env && while ! curl -s --fail-with-body http://$${METRICS_ADDR:-localhost}:11004/ready; do sleep 10; done
# ---------------------------- Docker ----------------------------
# ---------------------------- Database ----------------------------
psql:
@docker compose up -d postgres
@docker compose exec -it postgres psql -U postgres -d bluesky
init-db: .env
@docker compose up -d --build lister
@sleep 10
@docker compose stop lister
@cat ./db-migration/init.sql | docker exec -i "$$(docker compose ps --format '{{.Names}}' postgres)" psql -U postgres -d bluesky
sqltop:
watch -n 1 'cat top.sql|docker compose exec -i postgres psql -U postgres -d bluesky'
sqldu:
cat du.sql | docker compose exec -iT postgres psql -U postgres -d bluesky
# ---------------------------- Database ----------------------------
# ---------------------------- CSV Export ----------------------------
# NOT RECOMMENDED TO RUN for the first time on a hot live DB: it will chomp all available IO. Stop the other services first.
csv-export:
@docker compose up -d postgres
@sleep 10
@nohup ./csv_export.sh > csv_export.out &
csv-iexport:
@docker compose up -d postgres
@sleep 10
@nohup ./csv_iexport.sh > csv_iexport.out &
csv-iexport-month:
@docker compose up -d postgres
@sleep 10
@nohup ./csv_iexport_month.sh > csv_iexport_month.out &
kill-csv-export:
@kill -9 `pgrep csv_export.sh`
kill-csv-iexport:
@kill -9 `pgrep csv_iexport.sh`
kill-csv-iexport-month:
@kill -9 `pgrep csv_iexport_month.sh`
# ---------------------------- CSV Export ----------------------------
dash-export:
@./dashboards/export.sh
dash-import:
@./dashboards/update.sh

README.md
View File

@ -1,52 +1,7 @@
# Bluesky indexer

This is a bunch of code that can download all of Bluesky into a giant table in
PostgreSQL.

The structure of that table is roughly `(repo, collection, rkey) -> JSON`, and
it is a good idea to partition it by collection.

## System requirements

NOTE: all of this is valid as of April 2024, when Bluesky has ~5.5M accounts,
~1.2B records total, and an average daily peak of ~100 commits/s.

* One decent SATA SSD is plenty fast to keep up. Preferably a dedicated one
  (definitely not the same one your system is installed on). There will be a
  lot of writes happening, so the total durability of the disk will be used up
  at a non-negligible rate.
* 16GB of RAM, but the more the better, obviously.
* ZFS with compression enabled is highly recommended, but not strictly
  necessary.
  * Compression will cut down on IO bandwidth quite a bit, as well as on used
    disk space. On a compressed FS the whole database takes up about 270GB;
    without compression, almost 3 times as much.

## Overview of components

### Lister

Once a day, gets a list of all repos from all known PDSs and adds any that are
missing to the database.

### Consumer

Connects to the firehose of each PDS and stores all received records in the
database.

If `CONSUMER_RELAYS` is specified, it will also add to the database any new
PDSs that have records sent through a relay.

### Record indexer

Goes over all repos that might have missing data, gets a full checkout from the
PDS, and adds all missing records to the database.

### PLC mirror

Syncs the PLC operations log into a local table, and allows other components to
resolve `did:plc:` DIDs without putting strain on https://plc.directory and
hitting rate limits.

# PLC mirror

Syncs the PLC operations log into a local table, and allows resolving `did:plc:`
DIDs without putting strain on https://plc.directory and hitting rate limits.
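Until the initial replication finishes, the mirror reports itself as not ready; the removed `make wait-for-plc` target above polls for exactly that. A one-off check might look like this sketch, assuming the default `11004` port:

```sh
# Returns success only once the mirror has caught up with plc.directory;
# until then it fails, and resolution requests get a 500.
curl -s --fail-with-body http://localhost:11004/ready
```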
## Setup
@ -54,57 +9,12 @@ hitting rate limits.
* Copy `example.env` to `.env` and edit it to your liking.
  * `POSTGRES_PASSWORD` can be anything; it will be used on the first start of
    the `postgres` container to initialize the database.
* Optional: copy `docker-compose.override.yml.example` to
  `docker-compose.override.yml` to change some parts of `docker-compose.yml`
  without actually editing it (and introducing the possibility of merge
  conflicts later on).
* `make start-plc`
  * This will start PostgreSQL and the PLC mirror.
* `make wait-for-plc`
  * This will wait until the PLC mirror has fully replicated the operations
    log. That's gonna take a few hours.
  * Technically you can start everything before it is caught up: it will
    return errors and other components will fall back to querying
    https://plc.directory. But you will be rate-limited quite hard.
* `make init-db`
  * This will add the initial set of PDS hosts into the database.
  * You can skip this if you're specifying `CONSUMER_RELAYS` in
    `docker-compose.override.yml`.
* `make up`

## Additional commands

* `make status` - will show container status and resource usage
* `make psql` - starts up a SQL shell inside the `postgres` container
* `make logs` - streams container logs into your terminal
* `make sqltop` - will show you currently running queries
* `make sqldu` - will show disk space usage for each table and index

## Usage

You can directly replace `https://plc.directory` with a URL to the exposed
port (11004 by default), as shown in the sketch below.

Note that on the first run it will take quite a few hours to download
everything, and the mirror will respond with 500 if it's not caught up yet.
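For illustration, resolving a DID through the local mirror could look like the following sketch; the `did:plc:` value is a made-up placeholder, and `localhost:11004` assumes the default port mapping:

```sh
# Same request path as on https://plc.directory, only the host differs.
# The did:plc: value below is a hypothetical placeholder.
curl -s http://localhost:11004/did:plc:aaaaaaaaaaaaaaaaaaaaaaaa
```

Any component that takes a PLC directory base URL can simply be pointed at `http://localhost:11004` instead.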
## Tweaking the number of indexer threads at runtime

Record indexer exposes a simple HTTP handler that allows you to do this:

`curl -s 'http://localhost:11003/pool/resize?size=10'`

## Advanced topics

### Table partitioning

With partitioning by collection you can have separate indexes for each record
type. Also, doing any kind of heavy processing on a particular record type will
also be faster, because all of those records will be in a separate table and
PostgreSQL will just read them sequentially, instead of checking the
`collection` column for each row.

You can do the partitioning at any point, but the more data you already have in
the database, the longer it will take.

Before doing this you need to run `lister` at least once in order to create the
tables (`make init-db` does this for you as well).

* Stop all containers except for `postgres`.
* Run the [SQL script](db-migration/migrations/20240217_partition.sql) in
  `psql` (an example invocation is sketched after this list).
* Check the [`migrations`](db-migration/migrations/) dir for any additional
  migrations you might be interested in.
* Once all is done, start the other containers again.
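A minimal sketch of that middle step, reusing the same psql piping pattern as the other database targets in the removed Makefile (run from the repository root, with everything except `postgres` stopped):

```sh
# Apply the partitioning migration through psql inside the postgres container.
cat ./db-migration/migrations/20240217_partition.sql \
  | docker compose exec -iT postgres psql -U postgres -d bluesky
```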

View File

@ -1 +0,0 @@
../.env

View File

@ -1,6 +0,0 @@
#!/bin/sh
cd ..
docker compose exec -i postgres pg_dump -U postgres -d bluesky -t records -t records_id_seq --schema-only | sed -E -e 's/PARTITION BY.*/;/' > records.sql
docker compose exec -i postgres pg_dump -U postgres -d bluesky --table-and-children records --load-via-partition-root --data-only | lz4 > records.sql.lz4

View File

@ -1,14 +0,0 @@
version: '3.8'
services:
postgres:
image: "postgres:16"
volumes:
- "${DATA_DIR:?specify data dir in .env file}/benchmark:/var/lib/postgresql/data:rw"
restart: always
extra_hosts:
- "host.docker.internal:host-gateway"
environment:
POSTGRES_DB: bluesky
POSTGRES_PASSWORD: "${POSTGRES_PASSWORD:?specify password in .env file}"
command: ["-c", "max_connections=1000"]

View File

@ -1,25 +0,0 @@
#!/bin/sh
output="psql_$(date '+%y%m%d_%H%M%S').log"
set -x
docker compose stop postgres
. ./.env
sudo rm -rf ${DATA_DIR:?DATA_DIR not set}/benchmark
echo "$(date): Starting data import..."
docker compose up -d postgres
while ! docker compose exec postgres psql -U postgres -d bluesky -c 'select 1;'; do sleep 1; done
cat ../records.sql | docker compose exec -iT postgres psql -U postgres -d bluesky
lz4cat ../records.sql.lz4 | docker compose exec -iT postgres psql -U postgres -d bluesky
echo "$(date): Data import done"
cat ../db-migration/migrations/20240217_partition.sql \
| docker compose exec -iT postgres psql -U postgres -d bluesky --echo-queries -c '\timing' \
| tee -a "${output}"

View File

@ -1,14 +0,0 @@
FROM golang:1.22.3 as builder
WORKDIR /app
COPY go.mod go.sum ./
RUN go mod download
COPY . ./
RUN go build -trimpath ./cmd/consumer
FROM alpine:latest as certs
RUN apk --update add ca-certificates
FROM debian:stable-slim
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
COPY --from=builder /app/consumer .
ENTRYPOINT ["./consumer"]

View File

@ -1,7 +0,0 @@
*
**/*
!go.mod
!go.sum
!**/*.go
cmd/**
!cmd/consumer

View File

@ -1,622 +0,0 @@
package main
import (
"bytes"
"context"
"errors"
"fmt"
"io"
"math"
"net/http"
"net/url"
"path"
"strings"
"time"
"github.com/cenkalti/backoff/v4"
"github.com/gorilla/websocket"
"github.com/prometheus/client_golang/prometheus"
"github.com/rs/zerolog"
"gorm.io/gorm"
"gorm.io/gorm/clause"
comatproto "github.com/bluesky-social/indigo/api/atproto"
"github.com/bluesky-social/indigo/xrpc"
"github.com/ipld/go-ipld-prime/codec/dagcbor"
"github.com/ipld/go-ipld-prime/datamodel"
"github.com/ipld/go-ipld-prime/node/basicnode"
"github.com/uabluerail/indexer/models"
"github.com/uabluerail/indexer/pds"
"github.com/uabluerail/indexer/repo"
"github.com/uabluerail/indexer/util/fix"
"github.com/uabluerail/indexer/util/resolver"
)
const lastRevUpdateInterval = 24 * time.Hour
type BadRecord struct {
ID models.ID `gorm:"primarykey"`
CreatedAt time.Time
PDS models.ID `gorm:"index"`
Cursor int64
Error string
Content []byte
}
type Consumer struct {
db *gorm.DB
remote pds.PDS
running chan struct{}
lastCursorPersist time.Time
}
func NewConsumer(ctx context.Context, remote *pds.PDS, db *gorm.DB) (*Consumer, error) {
if err := db.AutoMigrate(&BadRecord{}); err != nil {
return nil, fmt.Errorf("db.AutoMigrate: %s", err)
}
return &Consumer{
db: db,
remote: *remote,
running: make(chan struct{}),
}, nil
}
func (c *Consumer) Start(ctx context.Context) error {
go c.run(ctx)
return nil
}
func (c *Consumer) Wait(ctx context.Context) error {
select {
case <-ctx.Done():
return ctx.Err()
case <-c.running:
// Channel got closed
return nil
}
}
func (c *Consumer) run(ctx context.Context) {
log := zerolog.Ctx(ctx).With().Str("pds", c.remote.Host).Logger()
ctx = log.WithContext(ctx)
backoffTimer := backoff.NewExponentialBackOff(
backoff.WithMaxElapsedTime(0),
backoff.WithInitialInterval(time.Second),
backoff.WithMaxInterval(5*time.Minute),
)
pdsOnline.WithLabelValues(c.remote.Host).Set(0)
defer close(c.running)
for {
select {
case <-c.running:
log.Error().Msgf("Attempt to start previously stopped consumer")
return
case <-ctx.Done():
log.Info().Msgf("Consumer stopped")
lastEventTimestamp.DeletePartialMatch(prometheus.Labels{"remote": c.remote.Host})
eventCounter.DeletePartialMatch(prometheus.Labels{"remote": c.remote.Host})
reposDiscovered.DeletePartialMatch(prometheus.Labels{"remote": c.remote.Host})
postsByLanguageIndexed.DeletePartialMatch(prometheus.Labels{"remote": c.remote.Host})
pdsOnline.DeletePartialMatch(prometheus.Labels{"remote": c.remote.Host})
return
default:
start := time.Now()
if err := c.runOnce(ctx); err != nil {
log.Error().Err(err).Msgf("Consumer of %q failed (will be restarted): %s", c.remote.Host, err)
connectionFailures.WithLabelValues(c.remote.Host).Inc()
}
if time.Since(start) > backoffTimer.MaxInterval*3 {
// XXX: assume that c.runOnce did some useful work in this case,
// even though it might have been stuck on some absurdly long timeouts.
backoffTimer.Reset()
}
time.Sleep(backoffTimer.NextBackOff())
}
}
}
func (c *Consumer) runOnce(ctx context.Context) error {
log := zerolog.Ctx(ctx)
log.Info().
Int64("cursor", c.remote.Cursor).
Int64("first_cursor_since_reset", c.remote.FirstCursorSinceReset).
Msgf("Connecting to firehose of %s...", c.remote.Host)
addr, err := url.Parse(c.remote.Host)
if err != nil {
return fmt.Errorf("parsing URL %q: %s", c.remote.Host, err)
}
addr.Scheme = "wss"
addr.Path = path.Join(addr.Path, "xrpc/com.atproto.sync.subscribeRepos")
if c.remote.Cursor > 0 {
params := url.Values{"cursor": []string{fmt.Sprint(c.remote.Cursor)}}
addr.RawQuery = params.Encode()
}
conn, _, err := websocket.DefaultDialer.DialContext(ctx, addr.String(), http.Header{})
if err != nil {
return fmt.Errorf("establishing websocker connection: %w", err)
}
defer conn.Close()
pdsOnline.WithLabelValues(c.remote.Host).Set(1)
defer func() { pdsOnline.WithLabelValues(c.remote.Host).Set(0) }()
ch := make(chan bool)
defer close(ch)
go func() {
t := time.NewTicker(time.Minute)
defer t.Stop()
for {
select {
case <-ch:
return
case <-t.C:
if err := conn.WriteControl(websocket.PingMessage, []byte("ping"), time.Now().Add(time.Minute)); err != nil {
log.Error().Err(err).Msgf("Failed to send ping: %s", err)
}
}
}
}()
first := true
for {
select {
case <-ctx.Done():
return ctx.Err()
default:
_, b, err := conn.ReadMessage()
if err != nil {
return fmt.Errorf("websocket.ReadMessage: %w", err)
}
r := bytes.NewReader(b)
proto := basicnode.Prototype.Any
headerNode := proto.NewBuilder()
if err := (&dagcbor.DecodeOptions{DontParseBeyondEnd: true}).Decode(headerNode, r); err != nil {
return fmt.Errorf("unmarshaling message header: %w", err)
}
header, err := parseHeader(headerNode.Build())
if err != nil {
return fmt.Errorf("parsing message header: %w", err)
}
switch header.Op {
case 1:
if err := c.processMessage(ctx, header.Type, r, first); err != nil {
if ctx.Err() != nil {
// We're shutting down, so the error is most likely due to that.
return err
}
const maxBadRecords = 500
var count int64
if err2 := c.db.Model(&BadRecord{}).Where(&BadRecord{PDS: c.remote.ID}).Count(&count).Error; err2 != nil {
return err
}
if count >= maxBadRecords {
return err
}
log.Error().Err(err).Str("pds", c.remote.Host).Msgf("Failed to process message at cursor %d: %s", c.remote.Cursor, err)
err := c.db.Create(&BadRecord{
PDS: c.remote.ID,
Cursor: c.remote.Cursor,
Error: err.Error(),
Content: b,
}).Error
if err != nil {
return fmt.Errorf("failed to store bad message: %s", err)
}
}
case -1:
bodyNode := proto.NewBuilder()
if err := (&dagcbor.DecodeOptions{DontParseBeyondEnd: true, AllowLinks: true}).Decode(bodyNode, r); err != nil {
return fmt.Errorf("unmarshaling message body: %w", err)
}
body, err := parseError(bodyNode.Build())
if err != nil {
return fmt.Errorf("parsing error payload: %w", err)
}
return &body
default:
log.Warn().Msgf("Unknown 'op' value received: %d", header.Op)
}
first = false
}
}
}
func (c *Consumer) resetCursor(ctx context.Context, seq int64) error {
zerolog.Ctx(ctx).Warn().Str("pds", c.remote.Host).Msgf("Cursor reset: %d -> %d", c.remote.Cursor, seq)
err := c.db.Model(&c.remote).
Where(&pds.PDS{ID: c.remote.ID}).
Updates(&pds.PDS{FirstCursorSinceReset: seq}).Error
if err != nil {
return fmt.Errorf("updating FirstCursorSinceReset: %w", err)
}
c.remote.FirstCursorSinceReset = seq
return nil
}
func (c *Consumer) updateCursor(ctx context.Context, seq int64) error {
if math.Abs(float64(seq-c.remote.Cursor)) < 100 && time.Since(c.lastCursorPersist) < 5*time.Second {
c.remote.Cursor = seq
return nil
}
err := c.db.Model(&c.remote).
Where(&pds.PDS{ID: c.remote.ID}).
Updates(&pds.PDS{Cursor: seq}).Error
if err != nil {
return fmt.Errorf("updating Cursor: %w", err)
}
c.remote.Cursor = seq
return nil
}
func (c *Consumer) processMessage(ctx context.Context, typ string, r io.Reader, first bool) error {
log := zerolog.Ctx(ctx)
eventCounter.WithLabelValues(c.remote.Host, typ).Inc()
switch typ {
case "#commit":
payload := &comatproto.SyncSubscribeRepos_Commit{}
if err := payload.UnmarshalCBOR(r); err != nil {
return fmt.Errorf("failed to unmarshal commit: %w", err)
}
exportEventTimestamp(ctx, c.remote.Host, payload.Time)
if c.remote.FirstCursorSinceReset == 0 {
if err := c.resetCursor(ctx, payload.Seq); err != nil {
return fmt.Errorf("handling cursor reset: %w", err)
}
}
repoInfo, created, err := repo.EnsureExists(ctx, c.db, payload.Repo)
if err != nil {
return fmt.Errorf("repo.EnsureExists(%q): %w", payload.Repo, err)
}
if repoInfo.LastKnownKey == "" {
_, pubKey, err := resolver.GetPDSEndpointAndPublicKey(ctx, payload.Repo)
if err != nil {
return fmt.Errorf("failed to get DID doc for %q: %w", payload.Repo, err)
}
repoInfo.LastKnownKey = pubKey
err = c.db.Model(repoInfo).Where(&repo.Repo{ID: repoInfo.ID}).Updates(&repo.Repo{LastKnownKey: pubKey}).Error
if err != nil {
return fmt.Errorf("failed to update the key for %q: %w", payload.Repo, err)
}
}
if repoInfo.PDS != c.remote.ID {
u, _, err := resolver.GetPDSEndpointAndPublicKey(ctx, payload.Repo)
if err == nil {
cur, err := pds.EnsureExists(ctx, c.db, u.String())
if err == nil {
if repoInfo.PDS != cur.ID {
// Repo was migrated, lets update our record.
err := c.db.Model(repoInfo).Where(&repo.Repo{ID: repoInfo.ID}).Updates(&repo.Repo{PDS: cur.ID}).Error
if err != nil {
log.Error().Err(err).Msgf("Repo %q was migrated to %q, but updating the repo has failed: %s", payload.Repo, cur.Host, err)
}
}
repoInfo.PDS = cur.ID
} else {
log.Error().Err(err).Msgf("Failed to get PDS record for %q: %s", u, err)
}
} else {
log.Error().Err(err).Msgf("Failed to get PDS endpoint for repo %q: %s", payload.Repo, err)
}
if repoInfo.PDS != c.remote.ID {
// We checked a recent version of DID doc and this is still not a correct PDS.
log.Error().Str("did", payload.Repo).Str("rev", payload.Rev).
Msgf("Commit from an incorrect PDS, skipping")
return nil
}
}
if created {
reposDiscovered.WithLabelValues(c.remote.Host).Inc()
}
expectRecords := false
deletions := []string{}
for _, op := range payload.Ops {
switch op.Action {
case "create":
expectRecords = true
case "update":
expectRecords = true
case "delete":
deletions = append(deletions, op.Path)
}
}
for _, d := range deletions {
parts := strings.SplitN(d, "/", 2)
if len(parts) != 2 {
continue
}
err := c.db.Model(&repo.Record{}).
Where(&repo.Record{
Repo: models.ID(repoInfo.ID),
Collection: parts[0],
Rkey: parts[1]}).
Updates(&repo.Record{Deleted: true}).Error
if err != nil {
return fmt.Errorf("failed to mark %s/%s as deleted: %w", payload.Repo, d, err)
}
}
newRecs, err := repo.ExtractRecords(ctx, bytes.NewReader(payload.Blocks), repoInfo.LastKnownKey)
if errors.Is(err, repo.ErrInvalidSignature) {
// Key might have been updated recently.
_, pubKey, err2 := resolver.GetPDSEndpointAndPublicKey(ctx, payload.Repo)
if err2 != nil {
return fmt.Errorf("failed to get DID doc for %q: %w", payload.Repo, err2)
}
if repoInfo.LastKnownKey != pubKey {
repoInfo.LastKnownKey = pubKey
err2 = c.db.Model(repoInfo).Where(&repo.Repo{ID: repoInfo.ID}).Updates(&repo.Repo{LastKnownKey: pubKey}).Error
if err2 != nil {
return fmt.Errorf("failed to update the key for %q: %w", payload.Repo, err2)
}
// Retry with the new key.
newRecs, err = repo.ExtractRecords(ctx, bytes.NewReader(payload.Blocks), pubKey)
}
}
if err != nil {
return fmt.Errorf("failed to extract records: %w", err)
}
recs := []repo.Record{}
for k, v := range newRecs {
parts := strings.SplitN(k, "/", 2)
if len(parts) != 2 {
log.Warn().Msgf("Unexpected key format: %q", k)
continue
}
langs, _, err := repo.GetLang(ctx, v)
if err == nil {
for _, lang := range langs {
postsByLanguageIndexed.WithLabelValues(c.remote.Host, lang).Inc()
}
}
recs = append(recs, repo.Record{
Repo: models.ID(repoInfo.ID),
Collection: parts[0],
Rkey: parts[1],
// XXX: proper replacement of \u0000 would require full parsing of JSON
// and recursive iteration over all string values, but this
// should work well enough for now.
Content: fix.EscapeNullCharForPostgres(v),
AtRev: payload.Rev,
})
}
if len(recs) == 0 && expectRecords {
log.Debug().Int64("seq", payload.Seq).Str("pds", c.remote.Host).Msgf("len(recs) == 0")
}
if len(recs) > 0 {
err = c.db.Model(&repo.Record{}).
Clauses(clause.OnConflict{
Where: clause.Where{Exprs: []clause.Expression{clause.Or(
clause.Eq{Column: clause.Column{Name: "at_rev", Table: "records"}, Value: nil},
clause.Eq{Column: clause.Column{Name: "at_rev", Table: "records"}, Value: ""},
clause.Lt{
Column: clause.Column{Name: "at_rev", Table: "records"},
Value: clause.Column{Name: "at_rev", Table: "excluded"}},
)}},
DoUpdates: clause.AssignmentColumns([]string{"content", "at_rev"}),
Columns: []clause.Column{{Name: "repo"}, {Name: "collection"}, {Name: "rkey"}}}).
Create(recs).Error
if err != nil {
return fmt.Errorf("inserting records into the database: %w", err)
}
}
if repoInfo.FirstCursorSinceReset > 0 && repoInfo.FirstRevSinceReset != "" &&
repoInfo.LastIndexedRev != "" &&
c.remote.FirstCursorSinceReset > 0 &&
repoInfo.FirstCursorSinceReset >= c.remote.FirstCursorSinceReset &&
repoInfo.FirstRevSinceReset <= repoInfo.LastIndexedRev &&
time.Since(repoInfo.UpdatedAt) > lastRevUpdateInterval {
err = c.db.Model(&repo.Repo{}).Where(&repo.Repo{ID: repoInfo.ID}).
Updates(&repo.Repo{
LastFirehoseRev: payload.Rev,
}).Error
if err != nil {
log.Error().Err(err).Msgf("Failed to update last_firehose_rev for %q: %s", repoInfo.DID, err)
}
}
if payload.TooBig {
// Just trigger a re-index by resetting rev.
err := c.db.Model(&repo.Repo{}).Where(&repo.Repo{ID: repoInfo.ID}).
Updates(&repo.Repo{
FirstCursorSinceReset: c.remote.FirstCursorSinceReset,
FirstRevSinceReset: payload.Rev,
}).Error
if err != nil {
return fmt.Errorf("failed to update repo info after cursor reset: %w", err)
}
}
if repoInfo.FirstCursorSinceReset != c.remote.FirstCursorSinceReset {
err := c.db.Model(&repo.Repo{}).Debug().Where(&repo.Repo{ID: repoInfo.ID}).
Updates(&repo.Repo{
FirstCursorSinceReset: c.remote.FirstCursorSinceReset,
FirstRevSinceReset: payload.Rev,
}).Error
if err != nil {
return fmt.Errorf("failed to update repo info after cursor reset: %w", err)
}
}
if err := c.updateCursor(ctx, payload.Seq); err != nil {
return err
}
case "#handle":
payload := &comatproto.SyncSubscribeRepos_Handle{}
if err := payload.UnmarshalCBOR(r); err != nil {
return fmt.Errorf("failed to unmarshal commit: %w", err)
}
exportEventTimestamp(ctx, c.remote.Host, payload.Time)
if c.remote.FirstCursorSinceReset == 0 {
if err := c.resetCursor(ctx, payload.Seq); err != nil {
return fmt.Errorf("handling cursor reset: %w", err)
}
}
// No-op, we don't store handles.
if err := c.updateCursor(ctx, payload.Seq); err != nil {
return err
}
case "#migrate":
payload := &comatproto.SyncSubscribeRepos_Migrate{}
if err := payload.UnmarshalCBOR(r); err != nil {
return fmt.Errorf("failed to unmarshal commit: %w", err)
}
exportEventTimestamp(ctx, c.remote.Host, payload.Time)
if c.remote.FirstCursorSinceReset == 0 {
if err := c.resetCursor(ctx, payload.Seq); err != nil {
return fmt.Errorf("handling cursor reset: %w", err)
}
}
log.Debug().Interface("payload", payload).Str("did", payload.Did).Msgf("MIGRATION")
// TODO
if err := c.updateCursor(ctx, payload.Seq); err != nil {
return err
}
case "#tombstone":
payload := &comatproto.SyncSubscribeRepos_Tombstone{}
if err := payload.UnmarshalCBOR(r); err != nil {
return fmt.Errorf("failed to unmarshal commit: %w", err)
}
exportEventTimestamp(ctx, c.remote.Host, payload.Time)
if c.remote.FirstCursorSinceReset == 0 {
if err := c.resetCursor(ctx, payload.Seq); err != nil {
return fmt.Errorf("handling cursor reset: %w", err)
}
}
// TODO
if err := c.updateCursor(ctx, payload.Seq); err != nil {
return err
}
case "#info":
payload := &comatproto.SyncSubscribeRepos_Info{}
if err := payload.UnmarshalCBOR(r); err != nil {
return fmt.Errorf("failed to unmarshal commit: %w", err)
}
switch payload.Name {
case "OutdatedCursor":
if !first {
log.Warn().Msgf("Received cursor reset notification in the middle of a stream: %+v", payload)
}
c.remote.FirstCursorSinceReset = 0
default:
log.Error().Msgf("Unknown #info message %q: %+v", payload.Name, payload)
}
case "#identity":
payload := &comatproto.SyncSubscribeRepos_Identity{}
if err := payload.UnmarshalCBOR(r); err != nil {
return fmt.Errorf("failed to unmarshal commit: %w", err)
}
exportEventTimestamp(ctx, c.remote.Host, payload.Time)
log.Trace().Str("did", payload.Did).Str("type", typ).Int64("seq", payload.Seq).
Msgf("#identity message: %s seq=%d time=%q", payload.Did, payload.Seq, payload.Time)
resolver.Resolver.FlushCacheFor(payload.Did)
// TODO: fetch DID doc and update PDS field?
default:
b, err := io.ReadAll(r)
if err != nil {
log.Error().Err(err).Msgf("Failed to read message payload: %s", err)
}
log.Warn().Msgf("Unknown message type received: %s payload=%q", typ, string(b))
}
return nil
}
type Header struct {
Op int64
Type string
}
func parseHeader(node datamodel.Node) (Header, error) {
r := Header{}
op, err := node.LookupByString("op")
if err != nil {
return r, fmt.Errorf("missing 'op': %w", err)
}
r.Op, err = op.AsInt()
if err != nil {
return r, fmt.Errorf("op.AsInt(): %w", err)
}
if r.Op == -1 {
// Error frame, type should not be present
return r, nil
}
t, err := node.LookupByString("t")
if err != nil {
return r, fmt.Errorf("missing 't': %w", err)
}
r.Type, err = t.AsString()
if err != nil {
return r, fmt.Errorf("t.AsString(): %w", err)
}
return r, nil
}
func parseError(node datamodel.Node) (xrpc.XRPCError, error) {
r := xrpc.XRPCError{}
e, err := node.LookupByString("error")
if err != nil {
return r, fmt.Errorf("missing 'error': %w", err)
}
r.ErrStr, err = e.AsString()
if err != nil {
return r, fmt.Errorf("error.AsString(): %w", err)
}
m, err := node.LookupByString("message")
if err == nil {
r.Message, err = m.AsString()
if err != nil {
return r, fmt.Errorf("message.AsString(): %w", err)
}
} else if !errors.Is(err, datamodel.ErrNotExists{}) {
return r, fmt.Errorf("looking up 'message': %w", err)
}
return r, nil
}
func exportEventTimestamp(ctx context.Context, remote string, timestamp string) {
if t, err := time.Parse(time.RFC3339, timestamp); err != nil {
zerolog.Ctx(ctx).Error().Err(err).Str("pds", remote).Msgf("Failed to parse %q as a timestamp: %s", timestamp, err)
} else {
lastEventTimestamp.WithLabelValues(remote).Set(float64(t.Unix()))
}
}

View File

@ -1,268 +0,0 @@
package main
import (
"context"
"flag"
"fmt"
"io"
"log"
"net/http"
_ "net/http/pprof"
"os"
"os/signal"
"path/filepath"
"runtime"
"runtime/debug"
"strings"
"sync"
"syscall"
"time"
_ "github.com/joho/godotenv/autoload"
"github.com/kelseyhightower/envconfig"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/rs/zerolog"
"gorm.io/driver/postgres"
"gorm.io/gorm"
"gorm.io/gorm/logger"
"github.com/uabluerail/indexer/pds"
"github.com/uabluerail/indexer/util/gormzerolog"
)
type Config struct {
LogFile string
LogFormat string `default:"text"`
LogLevel int64 `default:"1"`
MetricsPort string `split_words:"true"`
DBUrl string `envconfig:"POSTGRES_URL"`
Relays string
}
var config Config
func runMain(ctx context.Context) error {
ctx = setupLogging(ctx)
log := zerolog.Ctx(ctx)
log.Debug().Msgf("Starting up...")
db, err := gorm.Open(postgres.Open(config.DBUrl), &gorm.Config{
Logger: gormzerolog.New(&logger.Config{
SlowThreshold: 3 * time.Second,
IgnoreRecordNotFoundError: true,
}, nil),
})
if err != nil {
return fmt.Errorf("connecting to the database: %w", err)
}
log.Debug().Msgf("DB connection established")
if config.Relays != "" {
for _, host := range strings.Split(config.Relays, ",") {
c, err := NewRelayConsumer(ctx, host, db)
if err != nil {
log.Error().Err(err).Msgf("Failed to create relay consumer for %q: %s", host, err)
}
c.Start(ctx)
}
}
consumersCh := make(chan struct{})
go runConsumers(ctx, db, consumersCh)
log.Info().Msgf("Starting HTTP listener on %q...", config.MetricsPort)
http.Handle("/metrics", promhttp.Handler())
srv := &http.Server{Addr: fmt.Sprintf(":%s", config.MetricsPort)}
errCh := make(chan error)
go func() {
errCh <- srv.ListenAndServe()
}()
select {
case <-ctx.Done():
if err := srv.Shutdown(context.Background()); err != nil {
return fmt.Errorf("HTTP server shutdown failed: %w", err)
}
}
log.Info().Msgf("Waiting for consumers to stop...")
<-consumersCh
return <-errCh
}
func runConsumers(ctx context.Context, db *gorm.DB, doneCh chan struct{}) {
log := zerolog.Ctx(ctx)
defer close(doneCh)
type consumerHandle struct {
cancel context.CancelFunc
consumer *Consumer
}
running := map[string]consumerHandle{}
ticker := time.NewTicker(time.Minute)
defer ticker.Stop()
t := make(chan time.Time, 1)
t <- time.Now()
for {
select {
case <-t:
remotes := []pds.PDS{}
if err := db.Find(&remotes).Error; err != nil {
log.Error().Err(err).Msgf("Failed to get a list of known PDSs: %s", err)
break
}
shouldBeRunning := map[string]pds.PDS{}
for _, remote := range remotes {
if remote.Disabled {
continue
}
shouldBeRunning[remote.Host] = remote
}
for host, handle := range running {
if _, found := shouldBeRunning[host]; found {
continue
}
handle.cancel()
_ = handle.consumer.Wait(ctx)
delete(running, host)
}
for host, remote := range shouldBeRunning {
if _, found := running[host]; found {
continue
}
subCtx, cancel := context.WithCancel(ctx)
c, err := NewConsumer(subCtx, &remote, db)
if err != nil {
log.Error().Err(err).Msgf("Failed to create a consumer for %q: %s", remote.Host, err)
cancel()
continue
}
if err := c.Start(subCtx); err != nil {
log.Error().Err(err).Msgf("Failed ot start a consumer for %q: %s", remote.Host, err)
cancel()
continue
}
running[host] = consumerHandle{
cancel: cancel,
consumer: c,
}
}
case <-ctx.Done():
var wg sync.WaitGroup
for host, handle := range running {
wg.Add(1)
go func(handle consumerHandle) {
handle.cancel()
_ = handle.consumer.Wait(ctx)
wg.Done()
}(handle)
delete(running, host)
}
wg.Wait()
case v := <-ticker.C:
// Non-blocking send.
select {
case t <- v:
default:
}
}
}
}
func main() {
flag.StringVar(&config.LogFile, "log", "", "Path to the log file. If empty, will log to stderr")
flag.StringVar(&config.LogFormat, "log-format", "text", "Logging format. 'text' or 'json'")
flag.Int64Var(&config.LogLevel, "log-level", 1, "Log level. -1 - trace, 0 - debug, 1 - info, 5 - panic")
flag.StringVar(&config.Relays, "relays", "", "List of relays to connect to (for discovering new PDSs)")
if err := envconfig.Process("consumer", &config); err != nil {
log.Fatalf("envconfig.Process: %s", err)
}
flag.Parse()
ctx, _ := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
if err := runMain(ctx); err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
}
func setupLogging(ctx context.Context) context.Context {
logFile := os.Stderr
if config.LogFile != "" {
f, err := os.OpenFile(config.LogFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
log.Fatalf("Failed to open the specified log file %q: %s", config.LogFile, err)
}
logFile = f
}
var output io.Writer
switch config.LogFormat {
case "json":
output = logFile
case "text":
prefixList := []string{}
info, ok := debug.ReadBuildInfo()
if ok {
prefixList = append(prefixList, info.Path+"/")
}
basedir := ""
_, sourceFile, _, ok := runtime.Caller(0)
if ok {
basedir = filepath.Dir(sourceFile)
}
if basedir != "" && strings.HasPrefix(basedir, "/") {
prefixList = append(prefixList, basedir+"/")
head, _ := filepath.Split(basedir)
for head != "/" {
prefixList = append(prefixList, head)
head, _ = filepath.Split(strings.TrimSuffix(head, "/"))
}
}
output = zerolog.ConsoleWriter{
Out: logFile,
NoColor: true,
TimeFormat: time.RFC3339,
PartsOrder: []string{
zerolog.LevelFieldName,
zerolog.TimestampFieldName,
zerolog.CallerFieldName,
zerolog.MessageFieldName,
},
FormatFieldName: func(i interface{}) string { return fmt.Sprintf("%s:", i) },
FormatFieldValue: func(i interface{}) string { return fmt.Sprintf("%s", i) },
FormatCaller: func(i interface{}) string {
s := i.(string)
for _, p := range prefixList {
s = strings.TrimPrefix(s, p)
}
return s
},
}
default:
log.Fatalf("Invalid log format specified: %q", config.LogFormat)
}
logger := zerolog.New(output).Level(zerolog.Level(config.LogLevel)).With().Caller().Timestamp().Logger()
ctx = logger.WithContext(ctx)
zerolog.DefaultContextLogger = &logger
log.SetOutput(logger)
return ctx
}

View File

@ -1,36 +0,0 @@
package main
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
var lastEventTimestamp = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "repo_commit_received_timestamp",
Help: "Timestamp of the last event received from firehose.",
}, []string{"remote"})
var eventCounter = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "repo_commits_received_counter",
Help: "Counter of events received from each remote.",
}, []string{"remote", "type"})
var reposDiscovered = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "repo_discovered_counter",
Help: "Counter of newly discovered repos",
}, []string{"remote"})
var postsByLanguageIndexed = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "indexer_posts_by_language_count",
Help: "Number of posts by language",
}, []string{"remote", "lang"})
var connectionFailures = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "consumer_connection_failures",
Help: "Counter of firehose connection failures",
}, []string{"remote"})
var pdsOnline = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "consumer_connection_up",
Help: "Status of a connection. 1 - up and running.",
}, []string{"remote"})

View File

@ -1,179 +0,0 @@
package main
import (
"bytes"
"context"
"fmt"
"io"
"net/http"
"net/url"
"path"
"time"
comatproto "github.com/bluesky-social/indigo/api/atproto"
"github.com/gorilla/websocket"
"github.com/ipld/go-ipld-prime/codec/dagcbor"
"github.com/ipld/go-ipld-prime/node/basicnode"
"github.com/rs/zerolog"
"github.com/uabluerail/indexer/pds"
"github.com/uabluerail/indexer/util/resolver"
"gorm.io/gorm"
)
type RelayConsumer struct {
url string
db *gorm.DB
}
func NewRelayConsumer(ctx context.Context, host string, db *gorm.DB) (*RelayConsumer, error) {
addr, err := url.Parse(host)
if err != nil {
return nil, fmt.Errorf("parsing URL %q: %s", host, err)
}
addr.Scheme = "wss"
addr.Path = path.Join(addr.Path, "xrpc/com.atproto.sync.subscribeRepos")
return &RelayConsumer{db: db, url: addr.String()}, nil
}
func (c *RelayConsumer) Start(ctx context.Context) {
go c.run(ctx)
}
func (c *RelayConsumer) run(ctx context.Context) {
log := zerolog.Ctx(ctx).With().Str("relay", c.url).Logger()
ctx = log.WithContext(ctx)
for {
select {
case <-ctx.Done():
log.Info().Msgf("Relay consumer stopped")
return
default:
if err := c.runOnce(ctx); err != nil {
log.Error().Err(err).Msgf("Consumer of relay %q failed (will be restarted): %s", c.url, err)
}
time.Sleep(time.Second)
}
}
}
func (c *RelayConsumer) runOnce(ctx context.Context) error {
log := zerolog.Ctx(ctx)
conn, _, err := websocket.DefaultDialer.DialContext(ctx, c.url, http.Header{})
if err != nil {
return fmt.Errorf("establishing websocker connection: %w", err)
}
defer conn.Close()
for {
select {
case <-ctx.Done():
return ctx.Err()
default:
_, b, err := conn.ReadMessage()
if err != nil {
return fmt.Errorf("websocket.ReadMessage: %w", err)
}
r := bytes.NewReader(b)
proto := basicnode.Prototype.Any
headerNode := proto.NewBuilder()
if err := (&dagcbor.DecodeOptions{DontParseBeyondEnd: true}).Decode(headerNode, r); err != nil {
return fmt.Errorf("unmarshaling message header: %w", err)
}
header, err := parseHeader(headerNode.Build())
if err != nil {
return fmt.Errorf("parsing message header: %w", err)
}
switch header.Op {
case 1:
if err := c.processMessage(ctx, header.Type, r); err != nil {
log.Info().Err(err).Msgf("Relay consumer failed to process a message: %s", err)
}
case -1:
bodyNode := proto.NewBuilder()
if err := (&dagcbor.DecodeOptions{DontParseBeyondEnd: true, AllowLinks: true}).Decode(bodyNode, r); err != nil {
return fmt.Errorf("unmarshaling message body: %w", err)
}
body, err := parseError(bodyNode.Build())
if err != nil {
return fmt.Errorf("parsing error payload: %w", err)
}
return &body
default:
log.Warn().Msgf("Unknown 'op' value received: %d", header.Op)
}
}
}
}
func (c *RelayConsumer) processMessage(ctx context.Context, typ string, r io.Reader) error {
log := zerolog.Ctx(ctx)
did := ""
switch typ {
case "#commit":
payload := &comatproto.SyncSubscribeRepos_Commit{}
if err := payload.UnmarshalCBOR(r); err != nil {
return fmt.Errorf("failed to unmarshal commit: %w", err)
}
did = payload.Repo
case "#handle":
payload := &comatproto.SyncSubscribeRepos_Handle{}
if err := payload.UnmarshalCBOR(r); err != nil {
return fmt.Errorf("failed to unmarshal commit: %w", err)
}
did = payload.Did
case "#migrate":
payload := &comatproto.SyncSubscribeRepos_Migrate{}
if err := payload.UnmarshalCBOR(r); err != nil {
return fmt.Errorf("failed to unmarshal commit: %w", err)
}
did = payload.Did
case "#tombstone":
payload := &comatproto.SyncSubscribeRepos_Tombstone{}
if err := payload.UnmarshalCBOR(r); err != nil {
return fmt.Errorf("failed to unmarshal commit: %w", err)
}
did = payload.Did
case "#info":
// Ignore
case "#identity":
payload := &comatproto.SyncSubscribeRepos_Identity{}
if err := payload.UnmarshalCBOR(r); err != nil {
return fmt.Errorf("failed to unmarshal commit: %w", err)
}
did = payload.Did
default:
b, err := io.ReadAll(r)
if err != nil {
log.Error().Err(err).Msgf("Failed to read message payload: %s", err)
}
log.Warn().Msgf("Unknown message type received: %s payload=%q", typ, string(b))
}
if did == "" {
return nil
}
u, _, err := resolver.GetPDSEndpointAndPublicKey(ctx, did)
if err != nil {
return err
}
_, err = pds.EnsureExists(ctx, c.db, u.String())
return err
}

View File

@ -1,14 +0,0 @@
FROM golang:1.22.3 as builder
WORKDIR /app
COPY go.mod go.sum ./
RUN go mod download
COPY . ./
RUN go build -trimpath ./cmd/lister
FROM alpine:latest as certs
RUN apk --update add ca-certificates
FROM debian:stable-slim
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
COPY --from=builder /app/lister .
ENTRYPOINT ["./lister"]

View File

@ -1,7 +0,0 @@
*
**/*
!go.mod
!go.sum
!**/*.go
cmd/**
!cmd/lister

View File

@ -1,151 +0,0 @@
package main
import (
"context"
"errors"
"time"
"github.com/rs/zerolog"
"gorm.io/gorm"
comatproto "github.com/bluesky-social/indigo/api/atproto"
"github.com/bluesky-social/indigo/did"
"github.com/uabluerail/bsky-tools/pagination"
"github.com/uabluerail/bsky-tools/xrpcauth"
"github.com/uabluerail/indexer/pds"
"github.com/uabluerail/indexer/repo"
"github.com/uabluerail/indexer/util/resolver"
)
type Lister struct {
db *gorm.DB
resolver did.Resolver
pollInterval time.Duration
listRefreshInterval time.Duration
}
func NewLister(ctx context.Context, db *gorm.DB) (*Lister, error) {
return &Lister{
db: db,
resolver: resolver.Resolver,
pollInterval: 5 * time.Minute,
listRefreshInterval: 24 * time.Hour,
}, nil
}
func (l *Lister) Start(ctx context.Context) error {
go l.run(ctx)
return nil
}
func (l *Lister) run(ctx context.Context) {
log := zerolog.Ctx(ctx)
ticker := time.NewTicker(l.pollInterval)
log.Info().Msgf("Lister starting...")
t := make(chan time.Time, 1)
t <- time.Now()
for {
select {
case <-ctx.Done():
log.Info().Msgf("Lister stopped (context expired)")
return
case <-t:
db := l.db.WithContext(ctx)
remote := pds.PDS{}
if err := db.Model(&remote).
Where("(disabled=false or disabled is null) and (last_list is null or last_list < ?)", time.Now().Add(-l.listRefreshInterval)).
Take(&remote).Error; err != nil {
if !errors.Is(err, gorm.ErrRecordNotFound) {
log.Error().Err(err).Msgf("Failed to query DB for a PDS to list repos from: %s", err)
}
break
}
if !pds.IsWhitelisted(remote.Host) {
log.Info().Msgf("PDS %q is not whitelisted, disabling it", remote.Host)
if err := db.Model(&remote).Where(&pds.PDS{ID: remote.ID}).Updates(&pds.PDS{Disabled: true}).Error; err != nil {
log.Error().Err(err).Msgf("Failed to disable PDS %q: %s", remote.Host, err)
}
break
}
client := xrpcauth.NewAnonymousClient(ctx)
client.Host = remote.Host
log.Info().Msgf("Listing repos from %q...", remote.Host)
repos, err := pagination.Reduce(
func(cursor string) (resp *comatproto.SyncListRepos_Output, nextCursor string, err error) {
resp, err = comatproto.SyncListRepos(ctx, client, cursor, 200)
if err == nil && resp.Cursor != nil {
nextCursor = *resp.Cursor
}
return
},
func(resp *comatproto.SyncListRepos_Output, acc []*comatproto.SyncListRepos_Repo) ([]*comatproto.SyncListRepos_Repo, error) {
for _, repo := range resp.Repos {
if repo == nil {
continue
}
acc = append(acc, repo)
}
return acc, nil
})
if err != nil {
log.Error().Err(err).Msgf("Failed to list repos from %q: %s", remote.Host, err)
// Update the timestamp so we don't get stuck on a single broken PDS
if err := db.Model(&remote).Updates(&pds.PDS{LastList: time.Now()}).Error; err != nil {
log.Error().Err(err).Msgf("Failed to update the timestamp of last list for %q: %s", remote.Host, err)
}
break
}
log.Info().Msgf("Received %d DIDs from %q", len(repos), remote.Host)
reposListed.WithLabelValues(remote.Host).Add(float64(len(repos)))
for _, repoInfo := range repos {
record, created, err := repo.EnsureExists(ctx, l.db, repoInfo.Did)
if err != nil {
log.Error().Err(err).Msgf("Failed to ensure that we have a record for the repo %q: %s", repoInfo.Did, err)
} else if created {
reposDiscovered.WithLabelValues(remote.Host).Inc()
}
if err == nil && record.FirstRevSinceReset == "" {
// Populate this field in case it's empty, so we don't have to wait for the first firehose event
// to trigger a resync.
err := l.db.Transaction(func(tx *gorm.DB) error {
var currentRecord repo.Repo
if err := tx.Model(&record).Where(&repo.Repo{ID: record.ID}).Take(&currentRecord).Error; err != nil {
return err
}
if currentRecord.FirstRevSinceReset != "" {
// Someone else already updated it, nothing to do.
return nil
}
var remote pds.PDS
if err := tx.Model(&remote).Where(&pds.PDS{ID: record.PDS}).Take(&remote).Error; err != nil {
return err
}
return tx.Model(&record).Where(&repo.Repo{ID: record.ID}).Updates(&repo.Repo{
FirstRevSinceReset: repoInfo.Rev,
FirstCursorSinceReset: remote.FirstCursorSinceReset,
}).Error
})
if err != nil {
log.Error().Err(err).Msgf("Failed to set the initial FirstRevSinceReset value for %q: %s", repoInfo.Did, err)
}
}
}
if err := db.Model(&remote).Updates(&pds.PDS{LastList: time.Now()}).Error; err != nil {
log.Error().Err(err).Msgf("Failed to update the timestamp of last list for %q: %s", remote.Host, err)
}
case v := <-ticker.C:
t <- v
}
}
}

View File

@ -1,168 +0,0 @@
package main
import (
"context"
"flag"
"fmt"
"io"
"log"
"net/http"
_ "net/http/pprof"
"os"
"os/signal"
"path/filepath"
"runtime"
"runtime/debug"
"strings"
"syscall"
"time"
_ "github.com/joho/godotenv/autoload"
"github.com/kelseyhightower/envconfig"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/rs/zerolog"
"gorm.io/driver/postgres"
"gorm.io/gorm"
"gorm.io/gorm/logger"
"github.com/uabluerail/indexer/util/gormzerolog"
)
type Config struct {
LogFile string
LogFormat string `default:"text"`
LogLevel int64 `default:"1"`
MetricsPort string `split_words:"true"`
DBUrl string `envconfig:"POSTGRES_URL"`
}
var config Config
func runMain(ctx context.Context) error {
ctx = setupLogging(ctx)
log := zerolog.Ctx(ctx)
log.Debug().Msgf("Starting up...")
db, err := gorm.Open(postgres.Open(config.DBUrl), &gorm.Config{
Logger: gormzerolog.New(&logger.Config{
SlowThreshold: 1 * time.Second,
IgnoreRecordNotFoundError: true,
}, nil),
})
if err != nil {
return fmt.Errorf("connecting to the database: %w", err)
}
log.Debug().Msgf("DB connection established")
lister, err := NewLister(ctx, db)
if err != nil {
return fmt.Errorf("failed to create lister: %w", err)
}
if err := lister.Start(ctx); err != nil {
return fmt.Errorf("failed to start lister: %w", err)
}
log.Info().Msgf("Starting HTTP listener on %q...", config.MetricsPort)
http.Handle("/metrics", promhttp.Handler())
srv := &http.Server{Addr: fmt.Sprintf(":%s", config.MetricsPort)}
errCh := make(chan error)
go func() {
errCh <- srv.ListenAndServe()
}()
select {
case <-ctx.Done():
if err := srv.Shutdown(context.Background()); err != nil {
return fmt.Errorf("HTTP server shutdown failed: %w", err)
}
}
return <-errCh
}
func main() {
flag.StringVar(&config.LogFile, "log", "", "Path to the log file. If empty, will log to stderr")
flag.StringVar(&config.LogFormat, "log-format", "text", "Logging format. 'text' or 'json'")
flag.Int64Var(&config.LogLevel, "log-level", 1, "Log level. -1 - trace, 0 - debug, 1 - info, 5 - panic")
if err := envconfig.Process("lister", &config); err != nil {
log.Fatalf("envconfig.Process: %s", err)
}
flag.Parse()
ctx, _ := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
if err := runMain(ctx); err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
}
func setupLogging(ctx context.Context) context.Context {
logFile := os.Stderr
if config.LogFile != "" {
f, err := os.OpenFile(config.LogFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
log.Fatalf("Failed to open the specified log file %q: %s", config.LogFile, err)
}
logFile = f
}
var output io.Writer
switch config.LogFormat {
case "json":
output = logFile
case "text":
prefixList := []string{}
info, ok := debug.ReadBuildInfo()
if ok {
prefixList = append(prefixList, info.Path+"/")
}
basedir := ""
_, sourceFile, _, ok := runtime.Caller(0)
if ok {
basedir = filepath.Dir(sourceFile)
}
if basedir != "" && strings.HasPrefix(basedir, "/") {
prefixList = append(prefixList, basedir+"/")
head, _ := filepath.Split(basedir)
for head != "/" {
prefixList = append(prefixList, head)
head, _ = filepath.Split(strings.TrimSuffix(head, "/"))
}
}
output = zerolog.ConsoleWriter{
Out: logFile,
NoColor: true,
TimeFormat: time.RFC3339,
PartsOrder: []string{
zerolog.LevelFieldName,
zerolog.TimestampFieldName,
zerolog.CallerFieldName,
zerolog.MessageFieldName,
},
FormatFieldName: func(i interface{}) string { return fmt.Sprintf("%s:", i) },
FormatFieldValue: func(i interface{}) string { return fmt.Sprintf("%s", i) },
FormatCaller: func(i interface{}) string {
s := i.(string)
for _, p := range prefixList {
s = strings.TrimPrefix(s, p)
}
return s
},
}
default:
log.Fatalf("Invalid log format specified: %q", config.LogFormat)
}
logger := zerolog.New(output).Level(zerolog.Level(config.LogLevel)).With().Caller().Timestamp().Logger()
ctx = logger.WithContext(ctx)
zerolog.DefaultContextLogger = &logger
log.SetOutput(logger)
return ctx
}

View File

@ -1,16 +0,0 @@
package main
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
var reposDiscovered = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "repo_discovered_counter",
Help: "Counter of newly discovered repos",
}, []string{"remote"})
var reposListed = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "repo_listed_counter",
Help: "Counter of repos received by listing PDSs.",
}, []string{"remote"})

View File

@ -1,14 +0,0 @@
FROM golang:1.22.3 as builder
WORKDIR /app
COPY go.mod go.sum ./
RUN go mod download
COPY . ./
RUN go build -trimpath ./cmd/record-indexer
FROM alpine:latest as certs
RUN apk --update add ca-certificates
FROM debian:stable-slim
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
COPY --from=builder /app/record-indexer .
ENTRYPOINT ["./record-indexer"]

View File

@ -1,7 +0,0 @@
*
**/*
!go.mod
!go.sum
!**/*.go
cmd/**
!cmd/record-indexer

View File

@ -1,77 +0,0 @@
package main
import (
"context"
"fmt"
"net/http"
"strconv"
"golang.org/x/time/rate"
)
func AddAdminHandlers(limiter *Limiter, pool *WorkerPool) {
http.HandleFunc("/rate/set", handleRateSet(limiter))
http.HandleFunc("/rate/setAll", handleRateSetAll(limiter))
http.HandleFunc("/pool/resize", handlePoolResize(pool))
}
func handlePoolResize(pool *WorkerPool) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
s := r.FormValue("size")
if s == "" {
http.Error(w, "need size", http.StatusBadRequest)
return
}
size, err := strconv.Atoi(s)
if err != nil {
http.Error(w, err.Error(), http.StatusBadRequest)
return
}
pool.Resize(context.Background(), size)
fmt.Fprintln(w, "OK")
}
}
func handleRateSet(limiter *Limiter) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
s := r.FormValue("limit")
if s == "" {
http.Error(w, "need limit", http.StatusBadRequest)
return
}
name := r.FormValue("name")
if name == "" {
http.Error(w, "need name", http.StatusBadRequest)
return
}
limit, err := strconv.Atoi(s)
if err != nil {
http.Error(w, err.Error(), http.StatusBadRequest)
return
}
limiter.SetLimit(context.Background(), name, rate.Limit(limit))
fmt.Fprintln(w, "OK")
}
}
func handleRateSetAll(limiter *Limiter) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
s := r.FormValue("limit")
if s == "" {
http.Error(w, "need limit", http.StatusBadRequest)
return
}
limit, err := strconv.Atoi(s)
if err != nil {
http.Error(w, err.Error(), http.StatusBadRequest)
return
}
limiter.SetAllLimits(context.Background(), rate.Limit(limit))
fmt.Fprintln(w, "OK")
}
}

View File

@ -1,179 +0,0 @@
package main
import (
"context"
"flag"
"fmt"
"io"
"log"
"net/http"
_ "net/http/pprof"
"os"
"os/signal"
"path/filepath"
"runtime"
"runtime/debug"
"strings"
"syscall"
"time"
_ "github.com/joho/godotenv/autoload"
"github.com/kelseyhightower/envconfig"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/rs/zerolog"
"gorm.io/driver/postgres"
"gorm.io/gorm"
"gorm.io/gorm/logger"
"github.com/uabluerail/indexer/util/gormzerolog"
)
type Config struct {
LogFile string
LogFormat string `default:"text"`
LogLevel int64 `default:"1"`
MetricsPort string `split_words:"true"`
DBUrl string `envconfig:"POSTGRES_URL"`
Workers int `default:"2"`
}
var config Config
func runMain(ctx context.Context) error {
ctx = setupLogging(ctx)
log := zerolog.Ctx(ctx)
log.Debug().Msgf("Starting up...")
db, err := gorm.Open(postgres.Open(config.DBUrl), &gorm.Config{
Logger: gormzerolog.New(&logger.Config{
SlowThreshold: 3 * time.Second,
IgnoreRecordNotFoundError: true,
}, nil),
})
if err != nil {
return fmt.Errorf("connecting to the database: %w", err)
}
log.Debug().Msgf("DB connection established")
limiter, err := NewLimiter(db)
if err != nil {
return fmt.Errorf("failed to create limiter: %w", err)
}
ch := make(chan WorkItem)
pool := NewWorkerPool(ch, db, config.Workers, limiter)
if err := pool.Start(ctx); err != nil {
return fmt.Errorf("failed to start worker pool: %w", err)
}
scheduler := NewScheduler(ch, db)
if err := scheduler.Start(ctx); err != nil {
return fmt.Errorf("failed to start scheduler: %w", err)
}
log.Info().Msgf("Starting HTTP listener on %q...", config.MetricsPort)
AddAdminHandlers(limiter, pool)
http.Handle("/metrics", promhttp.Handler())
srv := &http.Server{Addr: fmt.Sprintf(":%s", config.MetricsPort)}
errCh := make(chan error)
go func() {
errCh <- srv.ListenAndServe()
}()
select {
case <-ctx.Done():
if err := srv.Shutdown(context.Background()); err != nil {
return fmt.Errorf("HTTP server shutdown failed: %w", err)
}
}
return <-errCh
}
func main() {
flag.StringVar(&config.LogFile, "log", "", "Path to the log file. If empty, will log to stderr")
flag.StringVar(&config.LogFormat, "log-format", "text", "Logging format. 'text' or 'json'")
flag.Int64Var(&config.LogLevel, "log-level", 1, "Log level. -1 - trace, 0 - debug, 1 - info, 5 - panic")
flag.IntVar(&config.Workers, "workers", 2, "Number of workers to start with")
if err := envconfig.Process("indexer", &config); err != nil {
log.Fatalf("envconfig.Process: %s", err)
}
flag.Parse()
ctx, _ := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
if err := runMain(ctx); err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
}
func setupLogging(ctx context.Context) context.Context {
logFile := os.Stderr
if config.LogFile != "" {
f, err := os.OpenFile(config.LogFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
log.Fatalf("Failed to open the specified log file %q: %s", config.LogFile, err)
}
logFile = f
}
var output io.Writer
switch config.LogFormat {
case "json":
output = logFile
case "text":
prefixList := []string{}
info, ok := debug.ReadBuildInfo()
if ok {
prefixList = append(prefixList, info.Path+"/")
}
basedir := ""
_, sourceFile, _, ok := runtime.Caller(0)
if ok {
basedir = filepath.Dir(sourceFile)
}
if basedir != "" && strings.HasPrefix(basedir, "/") {
prefixList = append(prefixList, basedir+"/")
head, _ := filepath.Split(basedir)
for head != "/" {
prefixList = append(prefixList, head)
head, _ = filepath.Split(strings.TrimSuffix(head, "/"))
}
}
output = zerolog.ConsoleWriter{
Out: logFile,
NoColor: true,
TimeFormat: time.RFC3339,
PartsOrder: []string{
zerolog.LevelFieldName,
zerolog.TimestampFieldName,
zerolog.CallerFieldName,
zerolog.MessageFieldName,
},
FormatFieldName: func(i interface{}) string { return fmt.Sprintf("%s:", i) },
FormatFieldValue: func(i interface{}) string { return fmt.Sprintf("%s", i) },
FormatCaller: func(i interface{}) string {
s := i.(string)
for _, p := range prefixList {
s = strings.TrimPrefix(s, p)
}
return s
},
}
default:
log.Fatalf("Invalid log format specified: %q", config.LogFormat)
}
logger := zerolog.New(output).Level(zerolog.Level(config.LogLevel)).With().Caller().Timestamp().Logger()
ctx = logger.WithContext(ctx)
zerolog.DefaultContextLogger = &logger
log.SetOutput(logger)
return ctx
}

View File

@ -1,41 +0,0 @@
package main
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
var reposQueued = promauto.NewCounter(prometheus.CounterOpts{
Name: "indexer_repos_queued_count",
Help: "Number of repos added to the queue",
})
var queueLenght = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "indexer_queue_length",
Help: "Current length of indexing queue",
}, []string{"state"})
var reposFetched = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "indexer_repos_fetched_count",
Help: "Number of repos fetched",
}, []string{"remote", "success"})
var reposIndexed = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "indexer_repos_indexed_count",
Help: "Number of repos indexed",
}, []string{"success"})
var recordsFetched = promauto.NewCounter(prometheus.CounterOpts{
Name: "indexer_records_fetched_count",
Help: "Number of records fetched",
})
var recordsInserted = promauto.NewCounter(prometheus.CounterOpts{
Name: "indexer_records_inserted_count",
Help: "Number of records inserted into DB",
})
var workerPoolSize = promauto.NewGauge(prometheus.GaugeOpts{
Name: "indexer_workers_count",
Help: "Current number of workers running",
})

View File

@ -1,82 +0,0 @@
package main
import (
"context"
"fmt"
"sync"
"github.com/rs/zerolog"
"github.com/uabluerail/indexer/pds"
"golang.org/x/time/rate"
"gorm.io/gorm"
)
const defaultRateLimit = 10
type Limiter struct {
mu sync.RWMutex
db *gorm.DB
limiter map[string]*rate.Limiter
}
func NewLimiter(db *gorm.DB) (*Limiter, error) {
remotes := []pds.PDS{}
if err := db.Find(&remotes).Error; err != nil {
return nil, fmt.Errorf("failed to get the list of known PDSs: %w", err)
}
l := &Limiter{
db: db,
limiter: map[string]*rate.Limiter{},
}
for _, remote := range remotes {
limit := remote.CrawlLimit
if limit == 0 {
limit = defaultRateLimit
}
l.limiter[remote.Host] = rate.NewLimiter(rate.Limit(limit), limit*2)
}
return l, nil
}
func (l *Limiter) getLimiter(name string) *rate.Limiter {
l.mu.RLock()
limiter := l.limiter[name]
l.mu.RUnlock()
if limiter != nil {
return limiter
}
limiter = rate.NewLimiter(defaultRateLimit, defaultRateLimit*2)
l.mu.Lock()
l.limiter[name] = limiter
l.mu.Unlock()
return limiter
}
func (l *Limiter) Wait(ctx context.Context, name string) error {
return l.getLimiter(name).Wait(ctx)
}
func (l *Limiter) SetLimit(ctx context.Context, name string, limit rate.Limit) {
l.getLimiter(name).SetLimit(limit)
err := l.db.Model(&pds.PDS{}).Where(&pds.PDS{Host: name}).Updates(&pds.PDS{CrawlLimit: int(limit)}).Error
if err != nil {
zerolog.Ctx(ctx).Error().Err(err).Msgf("Failed to persist rate limit change for %q: %s", name, err)
}
}
func (l *Limiter) SetAllLimits(ctx context.Context, limit rate.Limit) {
l.mu.RLock()
for name, limiter := range l.limiter {
limiter.SetLimit(limit)
err := l.db.Model(&pds.PDS{}).Where(&pds.PDS{Host: name}).Updates(&pds.PDS{CrawlLimit: int(limit)}).Error
if err != nil {
zerolog.Ctx(ctx).Error().Err(err).Msgf("Failed to persist rate limit change for %q: %s", name, err)
}
}
l.mu.RUnlock()
}

View File

@ -1,158 +0,0 @@
package main
import (
"context"
"fmt"
"slices"
"time"
"github.com/rs/zerolog"
"github.com/uabluerail/indexer/pds"
"github.com/uabluerail/indexer/repo"
"gorm.io/gorm"
)
type Scheduler struct {
db *gorm.DB
output chan<- WorkItem
queue map[string]*repo.Repo
inProgress map[string]*repo.Repo
}
func NewScheduler(output chan<- WorkItem, db *gorm.DB) *Scheduler {
return &Scheduler{
db: db,
output: output,
queue: map[string]*repo.Repo{},
inProgress: map[string]*repo.Repo{},
}
}
func (s *Scheduler) Start(ctx context.Context) error {
go s.run(ctx)
return nil
}
func (s *Scheduler) run(ctx context.Context) {
log := zerolog.Ctx(ctx)
t := time.NewTicker(time.Minute)
defer t.Stop()
if err := s.fillQueue(ctx); err != nil {
log.Error().Err(err).Msgf("Failed to get more tasks for the queue: %s", err)
}
done := make(chan string)
for {
if len(s.queue) > 0 {
next := WorkItem{signal: make(chan struct{})}
for _, r := range s.queue {
next.Repo = r
break
}
select {
case <-ctx.Done():
return
case <-t.C:
if err := s.fillQueue(ctx); err != nil {
log.Error().Err(err).Msgf("Failed to get more tasks for the queue: %s", err)
}
case s.output <- next:
delete(s.queue, next.Repo.DID)
s.inProgress[next.Repo.DID] = next.Repo
go func(did string, ch chan struct{}) {
select {
case <-ch:
case <-ctx.Done():
}
done <- did
}(next.Repo.DID, next.signal)
s.updateQueueLenMetrics()
case did := <-done:
delete(s.inProgress, did)
s.updateQueueLenMetrics()
}
} else {
select {
case <-ctx.Done():
return
case <-t.C:
if err := s.fillQueue(ctx); err != nil {
log.Error().Err(err).Msgf("Failed to get more tasks for the queue: %s", err)
}
case did := <-done:
delete(s.inProgress, did)
s.updateQueueLenMetrics()
}
}
}
}
func (s *Scheduler) fillQueue(ctx context.Context) error {
const maxQueueLen = 10000
const maxAttempts = 3
if len(s.queue)+len(s.inProgress) >= maxQueueLen {
return nil
}
remotes := []pds.PDS{}
if err := s.db.Find(&remotes).Error; err != nil {
return fmt.Errorf("failed to get the list of PDSs: %w", err)
}
remotes = slices.DeleteFunc(remotes, func(pds pds.PDS) bool {
return pds.Disabled
})
perPDSLimit := maxQueueLen
if len(remotes) > 0 {
perPDSLimit = maxQueueLen * 2 / len(remotes)
}
if perPDSLimit < maxQueueLen/10 {
perPDSLimit = maxQueueLen / 10
}
// Fake remote to account for repos we didn't have a PDS for yet.
remotes = append(remotes, pds.PDS{ID: pds.Unknown})
for _, remote := range remotes {
repos := []repo.Repo{}
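// Select up to perPDSLimit repos from this PDS that either were never fully
// indexed, or whose indexed state may have gaps after a cursor reset
// (rev/cursor behind the PDS), skipping repos with too many failed attempts.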
err := s.db.Raw(`SELECT * FROM "repos" WHERE pds = ? AND (last_indexed_rev is null OR last_indexed_rev = '') AND failed_attempts < ?
UNION
SELECT "repos".* FROM "repos" left join "pds" on repos.pds = pds.id WHERE pds = ?
AND
(
(first_rev_since_reset is not null AND first_rev_since_reset <> ''
AND last_indexed_rev < first_rev_since_reset)
OR
("repos".first_cursor_since_reset is not null AND "repos".first_cursor_since_reset <> 0
AND "repos".first_cursor_since_reset < "pds".first_cursor_since_reset)
)
AND failed_attempts < ? LIMIT ?`,
remote.ID, maxAttempts, remote.ID, maxAttempts, perPDSLimit).
Scan(&repos).Error
if err != nil {
return fmt.Errorf("querying DB: %w", err)
}
for _, r := range repos {
if s.queue[r.DID] != nil || s.inProgress[r.DID] != nil {
continue
}
copied := r
s.queue[r.DID] = &copied
reposQueued.Inc()
}
s.updateQueueLenMetrics()
}
return nil
}
func (s *Scheduler) updateQueueLenMetrics() {
queueLength.WithLabelValues("queued").Set(float64(len(s.queue)))
queueLength.WithLabelValues("inProgress").Set(float64(len(s.inProgress)))
}

@ -1,315 +0,0 @@
package main
import (
"bytes"
"context"
"fmt"
"regexp"
"strings"
"time"
"github.com/imax9000/errors"
"github.com/rs/zerolog"
"gorm.io/gorm"
"gorm.io/gorm/clause"
comatproto "github.com/bluesky-social/indigo/api/atproto"
"github.com/bluesky-social/indigo/util"
"github.com/bluesky-social/indigo/xrpc"
"github.com/uabluerail/bsky-tools/xrpcauth"
"github.com/uabluerail/indexer/models"
"github.com/uabluerail/indexer/pds"
"github.com/uabluerail/indexer/repo"
"github.com/uabluerail/indexer/util/fix"
"github.com/uabluerail/indexer/util/resolver"
)
type WorkItem struct {
Repo *repo.Repo
signal chan struct{}
}
type WorkerPool struct {
db *gorm.DB
input <-chan WorkItem
limiter *Limiter
workerSignals []chan struct{}
resize chan int
}
func NewWorkerPool(input <-chan WorkItem, db *gorm.DB, size int, limiter *Limiter) *WorkerPool {
r := &WorkerPool{
db: db,
input: input,
limiter: limiter,
resize: make(chan int),
}
r.workerSignals = make([]chan struct{}, size)
for i := range r.workerSignals {
r.workerSignals[i] = make(chan struct{})
}
return r
}
func (p *WorkerPool) Start(ctx context.Context) error {
go p.run(ctx)
return nil
}
func (p *WorkerPool) Resize(ctx context.Context, size int) error {
select {
case <-ctx.Done():
return ctx.Err()
case p.resize <- size:
return nil
}
}
func (p *WorkerPool) run(ctx context.Context) {
for _, ch := range p.workerSignals {
go p.worker(ctx, ch)
}
workerPoolSize.Set(float64(len(p.workerSignals)))
for {
select {
case <-ctx.Done():
for _, ch := range p.workerSignals {
close(ch)
}
// also wait for all workers to stop?
return
case newSize := <-p.resize:
switch {
case newSize > len(p.workerSignals):
ch := make([]chan struct{}, newSize-len(p.workerSignals))
for i := range ch {
ch[i] = make(chan struct{})
go p.worker(ctx, ch[i])
}
p.workerSignals = append(p.workerSignals, ch...)
workerPoolSize.Set(float64(len(p.workerSignals)))
case newSize < len(p.workerSignals) && newSize > 0:
for _, ch := range p.workerSignals[newSize:] {
close(ch)
}
p.workerSignals = p.workerSignals[:newSize]
workerPoolSize.Set(float64(len(p.workerSignals)))
}
}
}
}
func (p *WorkerPool) worker(ctx context.Context, signal chan struct{}) {
log := zerolog.Ctx(ctx)
for {
select {
case <-ctx.Done():
return
case <-signal:
return
case work := <-p.input:
updates := &repo.Repo{}
if err := p.doWork(ctx, work); err != nil {
log.Error().Err(err).Msgf("Work task %q failed: %s", work.Repo.DID, err)
updates.LastError = err.Error()
updates.FailedAttempts = work.Repo.FailedAttempts + 1
reposIndexed.WithLabelValues("false").Inc()
} else {
updates.FailedAttempts = 0
reposIndexed.WithLabelValues("true").Inc()
}
updates.LastIndexAttempt = time.Now()
err := p.db.Model(&repo.Repo{}).
Where(&repo.Repo{ID: work.Repo.ID}).
Select("last_error", "last_index_attempt", "failed_attempts").
Updates(updates).Error
if err != nil {
log.Error().Err(err).Msgf("Failed to update repo info for %q: %s", work.Repo.DID, err)
}
}
}
}
func (p *WorkerPool) doWork(ctx context.Context, work WorkItem) error {
log := zerolog.Ctx(ctx)
defer close(work.signal)
u, pubKey, err := resolver.GetPDSEndpointAndPublicKey(ctx, work.Repo.DID)
if err != nil {
return err
}
remote, err := pds.EnsureExists(ctx, p.db, u.String())
if err != nil {
return fmt.Errorf("failed to get PDS records for %q: %w", u, err)
}
if work.Repo.PDS != remote.ID {
if err := p.db.Model(&work.Repo).Where(&repo.Repo{ID: work.Repo.ID}).Updates(&repo.Repo{PDS: remote.ID}).Error; err != nil {
return fmt.Errorf("failed to update repo's PDS to %q: %w", u, err)
}
work.Repo.PDS = remote.ID
}
client := xrpcauth.NewAnonymousClient(ctx)
client.Host = u.String()
client.Client = util.RobustHTTPClient()
client.Client.Timeout = 30 * time.Minute
knownCursorBeforeFetch := remote.FirstCursorSinceReset
retry:
if p.limiter != nil {
if err := p.limiter.Wait(ctx, u.String()); err != nil {
return fmt.Errorf("failed to wait on rate limiter: %w", err)
}
}
// TODO: add a configuration knob for switching between full and partial fetch.
sinceRev := work.Repo.LastIndexedRev
b, err := comatproto.SyncGetRepo(ctx, client, work.Repo.DID, sinceRev)
if err != nil {
if err, ok := errors.As[*xrpc.Error](err); ok {
if err.IsThrottled() && err.Ratelimit != nil {
log.Debug().Str("pds", u.String()).Msgf("Hit a rate limit (%s), sleeping until %s", err.Ratelimit.Policy, err.Ratelimit.Reset)
time.Sleep(time.Until(err.Ratelimit.Reset))
goto retry
}
}
reposFetched.WithLabelValues(u.String(), "false").Inc()
return fmt.Errorf("failed to fetch repo: %w", err)
}
if len(b) == 0 {
reposFetched.WithLabelValues(u.String(), "false").Inc()
return fmt.Errorf("PDS returned zero bytes")
}
reposFetched.WithLabelValues(u.String(), "true").Inc()
if work.Repo.PDS == pds.Unknown {
remote, err := pds.EnsureExists(ctx, p.db, u.String())
if err != nil {
return err
}
work.Repo.PDS = remote.ID
if err := p.db.Model(&work.Repo).Where(&repo.Repo{ID: work.Repo.ID}).Updates(&repo.Repo{PDS: work.Repo.PDS}).Error; err != nil {
return fmt.Errorf("failed to set repo's PDS: %w", err)
}
}
newRev, err := repo.GetRev(ctx, bytes.NewReader(b))
if sinceRev != "" && errors.Is(err, repo.ErrZeroBlocks) {
// No new records since the rev we requested above.
if work.Repo.FirstCursorSinceReset < knownCursorBeforeFetch {
if err := p.bumpFirstCursorSinceReset(work.Repo.ID, knownCursorBeforeFetch); err != nil {
return fmt.Errorf("updating first_cursor_since_reset: %w", err)
}
}
return nil
} else if err != nil {
l := 25
if len(b) < l {
l = len(b)
}
log.Debug().Err(err).Msgf("Total bytes fetched: %d. First few bytes: %q", len(b), string(b[:l]))
return fmt.Errorf("failed to read 'rev' from the fetched repo: %w", err)
}
newRecs, err := repo.ExtractRecords(ctx, bytes.NewReader(b), pubKey)
if err != nil {
return fmt.Errorf("failed to extract records: %w", err)
}
recs := []repo.Record{}
for k, v := range newRecs {
parts := strings.SplitN(k, "/", 2)
if len(parts) != 2 {
log.Warn().Msgf("Unexpected key format: %q", k)
continue
}
v = regexp.MustCompile(`[^\\](\\\\)*(\\u0000)`).ReplaceAll(v, []byte(`$1<0x00>`))
recs = append(recs, repo.Record{
Repo: models.ID(work.Repo.ID),
Collection: parts[0],
Rkey: parts[1],
// XXX: proper replacement of \u0000 would require full parsing of JSON
// and recursive iteration over all string values, but this
// should work well enough for now.
Content: fix.EscapeNullCharForPostgres(v),
AtRev: newRev,
})
}
recordsFetched.Add(float64(len(recs)))
if len(recs) > 0 {
for _, batch := range splitInBatches(recs, 500) {
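// Upsert the batch: only overwrite an existing record if its content
// actually changed and our stored copy is at an older (or unset) at_rev.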
result := p.db.Model(&repo.Record{}).
Clauses(clause.OnConflict{
Where: clause.Where{Exprs: []clause.Expression{
clause.Neq{
Column: clause.Column{Name: "content", Table: "records"},
Value: clause.Column{Name: "content", Table: "excluded"}},
clause.Or(
clause.Eq{Column: clause.Column{Name: "at_rev", Table: "records"}, Value: nil},
clause.Eq{Column: clause.Column{Name: "at_rev", Table: "records"}, Value: ""},
clause.Lt{
Column: clause.Column{Name: "at_rev", Table: "records"},
Value: clause.Column{Name: "at_rev", Table: "excluded"}},
)}},
DoUpdates: clause.AssignmentColumns([]string{"content", "at_rev"}),
Columns: []clause.Column{{Name: "repo"}, {Name: "collection"}, {Name: "rkey"}}}).
Create(batch)
if err := result.Error; err != nil {
return fmt.Errorf("inserting records into the database: %w", err)
}
recordsInserted.Add(float64(result.RowsAffected))
}
}
err = p.db.Model(&repo.Repo{}).Where(&repo.Repo{ID: work.Repo.ID}).
Updates(&repo.Repo{LastIndexedRev: newRev}).Error
if err != nil {
return fmt.Errorf("updating repo rev: %w", err)
}
if work.Repo.FirstCursorSinceReset < knownCursorBeforeFetch {
if err := p.bumpFirstCursorSinceReset(work.Repo.ID, knownCursorBeforeFetch); err != nil {
return fmt.Errorf("updating first_cursor_since_reset: %w", err)
}
}
// TODO: check for records that are missing in the repo download
// and mark them as deleted.
return nil
}
// bumpFirstCursorSinceReset increases repo's FirstCursorSinceReset iff it is currently lower than the supplied value.
func (p *WorkerPool) bumpFirstCursorSinceReset(repoId models.ID, cursorValue int64) error {
return p.db.Transaction(func(tx *gorm.DB) error {
var currentCursor int64
err := tx.Model(&repo.Repo{}).Where(&repo.Repo{ID: repoId}).
Select("first_cursor_since_reset").First(&currentCursor).Error
if err != nil {
return fmt.Errorf("failed to get current cursor value: %w", err)
}
if currentCursor < cursorValue {
return tx.Model(&repo.Repo{}).Where(&repo.Repo{ID: repoId}).
Updates(&repo.Repo{FirstCursorSinceReset: cursorValue}).Error
}
return nil
})
}
func splitInBatches[T any](s []T, batchSize int) [][]T {
var r [][]T
for i := 0; i < len(s); i += batchSize {
if i+batchSize < len(s) {
r = append(r, s[i:i+batchSize])
} else {
r = append(r, s[i:])
}
}
return r
}

@ -1,14 +0,0 @@
FROM golang:1.22.3 as builder
WORKDIR /app
COPY go.mod go.sum ./
RUN go mod download
COPY . ./
RUN go build -trimpath ./cmd/update-db-schema
FROM alpine:latest as certs
RUN apk --update add ca-certificates
FROM debian:stable-slim
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
COPY --from=builder /app/update-db-schema .
ENTRYPOINT ["./update-db-schema"]

@ -1,7 +0,0 @@
*
**/*
!go.mod
!go.sum
!**/*.go
cmd/**
!cmd/update-db-schema

@ -1,155 +0,0 @@
package main
import (
"context"
"flag"
"fmt"
"io"
"log"
_ "net/http/pprof"
"os"
"os/signal"
"path/filepath"
"runtime"
"runtime/debug"
"strings"
"syscall"
"time"
_ "github.com/joho/godotenv/autoload"
"github.com/kelseyhightower/envconfig"
"github.com/rs/zerolog"
"gorm.io/driver/postgres"
"gorm.io/gorm"
"gorm.io/gorm/logger"
"github.com/uabluerail/indexer/pds"
"github.com/uabluerail/indexer/repo"
"github.com/uabluerail/indexer/util/gormzerolog"
)
type Config struct {
LogFile string
LogFormat string `default:"text"`
LogLevel int64 `default:"1"`
DBUrl string `envconfig:"POSTGRES_URL"`
}
var config Config
func runMain(ctx context.Context) error {
ctx = setupLogging(ctx)
log := zerolog.Ctx(ctx)
log.Debug().Msgf("Starting up...")
db, err := gorm.Open(postgres.Open(config.DBUrl), &gorm.Config{
Logger: gormzerolog.New(&logger.Config{
SlowThreshold: 1 * time.Second,
IgnoreRecordNotFoundError: true,
}, nil),
})
if err != nil {
return fmt.Errorf("connecting to the database: %w", err)
}
log.Debug().Msgf("DB connection established")
for _, f := range []func(*gorm.DB) error{
pds.AutoMigrate,
repo.AutoMigrate,
} {
if err := f(db); err != nil {
return fmt.Errorf("auto-migrating DB schema: %w", err)
}
}
log.Debug().Msgf("DB schema updated")
return nil
}
func main() {
flag.StringVar(&config.LogFile, "log", "", "Path to the log file. If empty, will log to stderr")
flag.StringVar(&config.LogFormat, "log-format", "text", "Logging format. 'text' or 'json'")
flag.Int64Var(&config.LogLevel, "log-level", 1, "Log level. -1 - trace, 0 - debug, 1 - info, 5 - panic")
if err := envconfig.Process("update-db-schema", &config); err != nil {
log.Fatalf("envconfig.Process: %s", err)
}
flag.Parse()
ctx, _ := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
if err := runMain(ctx); err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
}
func setupLogging(ctx context.Context) context.Context {
logFile := os.Stderr
if config.LogFile != "" {
f, err := os.OpenFile(config.LogFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
log.Fatalf("Failed to open the specified log file %q: %s", config.LogFile, err)
}
logFile = f
}
var output io.Writer
switch config.LogFormat {
case "json":
output = logFile
case "text":
prefixList := []string{}
info, ok := debug.ReadBuildInfo()
if ok {
prefixList = append(prefixList, info.Path+"/")
}
basedir := ""
_, sourceFile, _, ok := runtime.Caller(0)
if ok {
basedir = filepath.Dir(sourceFile)
}
if basedir != "" && strings.HasPrefix(basedir, "/") {
prefixList = append(prefixList, basedir+"/")
head, _ := filepath.Split(basedir)
for head != "/" {
prefixList = append(prefixList, head)
head, _ = filepath.Split(strings.TrimSuffix(head, "/"))
}
}
output = zerolog.ConsoleWriter{
Out: logFile,
NoColor: true,
TimeFormat: time.RFC3339,
PartsOrder: []string{
zerolog.LevelFieldName,
zerolog.TimestampFieldName,
zerolog.CallerFieldName,
zerolog.MessageFieldName,
},
FormatFieldName: func(i interface{}) string { return fmt.Sprintf("%s:", i) },
FormatFieldValue: func(i interface{}) string { return fmt.Sprintf("%s", i) },
FormatCaller: func(i interface{}) string {
s := i.(string)
for _, p := range prefixList {
s = strings.TrimPrefix(s, p)
}
return s
},
}
default:
log.Fatalf("Invalid log format specified: %q", config.LogFormat)
}
logger := zerolog.New(output).Level(zerolog.Level(config.LogLevel)).With().Caller().Timestamp().Logger()
ctx = logger.WithContext(ctx)
zerolog.DefaultContextLogger = &logger
log.SetOutput(logger)
return ctx
}

@ -1,320 +0,0 @@
# Data consistency model
## Indicators received from upstream
We have two interconnected strictly ordered values: `rev` and cursor. `rev` is
local to each repo, cursor provides additional ordering across all repos hosted
on a PDS.
### `rev`
String value, sequencing each commit within a given repo. Each next commit must
have a `rev` value strictly greater than the previous commit.
### Cursor
Integer number, sent with each message in firehose. Must be strictly increasing.
Messages also contain `rev` value for the corresponding repo event, and we
assume that within each repo all commits with smaller `rev` values also were
sent with smaller cursor values. That is, cursor sequences all events recorded
by the PDS and we assume that events of each given repo are sent in proper
order.
#### Cursor reset
"Cursor reset" is a situation where upon reconnecting to a PDS we find out that
the PDS is unable to send us all events that happened since the cursor value we
have recorded. It is **Very Bad**™, because we have no idea which events we
missed between our recorded cursor and the new cursor that the PDS has sent us.
This gap in data from a PDS must be addressed somehow, and most of this document
revolves around detecting when a given repo is affected by a cursor reset and
how to recover missing data with minimal effort.
## Available operations
### Repo fetch
We can fetch a full copy of a repo. Each commit contains a `rev` - string value
that is strictly increasing with each new commit.
We also have the option to only fetch records created after a particular `rev` -
this is useful for reducing the amount of data received when we already have
some of the records.
### Consuming firehose
We can stream new events from each PDS. Every event comes with a cursor value -
integer number that is strictly increasing, scoped to a PDS. Events also contain
repo-specific `rev` which is the same with a full repo fetch.
## High-level overview
With `rev` imposing strict ordering on repo operations, we maintain the
following two indicators for each repo:
1. `LastCompleteRev` - largest `rev` value that we are sure we have the complete
set of records at. For example, we can set this after processing the output
of `getRepo` call.
2. `FirstUninterruptedFirehoseRev` - smallest `rev` value from which we are sure
to have a complete set of records up until ~now.
These indicators define two intervals of `rev` values (`(-Infinity,
LastCompleteRev]`, `[FirstUninterruptedFirehoseRev, now)`) that we assume to
have already processed. If these intervals overlap - we assume that we've
covered `(-Infinity, now)`, i.e., have a complete set of records of a given
repo. If they don't overlap - we might have missed some records, and can
remediate that by fetching the whole repo, indexing records we don't have and
updating `LastCompleteRev`.
Both of these indicators should never decrease. When a PDS tells us that our
cursor value is invalid, we move `FirstUninterruptedFirehoseRev` forward, which
in turn can make the above intervals non-overlapping.
These indicators also can be uninitialized, which means that we have no data
about the corresponding interval.
Note that for performance and feasibility reasons we don't store these two
indicators in the database directly. Instead, to minimize the number of writes,
we derive them from a few other values.
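To make the interval check concrete, here is a minimal Go sketch (the function
name is an illustrative assumption, not the actual code; `rev` strings compare
lexicographically in commit order, and an empty string stands for an
uninitialized indicator):

```go
// isComplete reports whether the two covered intervals overlap, i.e. whether
// we believe we hold every record of the repo up to now.
func isComplete(lastCompleteRev, firstUninterruptedFirehoseRev string) bool {
	if lastCompleteRev == "" || firstUninterruptedFirehoseRev == "" {
		// One of the intervals is unknown, so we can't claim completeness.
		return false
	}
	// (-Inf, lastCompleteRev] and [firstUninterruptedFirehoseRev, now)
	// overlap iff lastCompleteRev >= firstUninterruptedFirehoseRev.
	return lastCompleteRev >= firstUninterruptedFirehoseRev
}
```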
### Updating `LastCompleteRev`
We can move `LastCompleteRev` forward when either:
* We just indexed a full repo checkout
* We got a new record from firehose AND the repo currently has no gaps
(`LastCompleteRev` >= `FirstUninterruptedFirehoseRev`)
### Updating `FirstUninterruptedFirehoseRev`
Once initialized, stays constant during normal operation. Can move forward if a
PDS informs us that we missed some records and it can't replay all of them (and
resets our cursor).
## Handling cursor resets
### Naive approach
We could store `FirstUninterruptedFirehoseRev` in a column for each repo, and
when we detect a cursor reset - unset it for every repo from a particular PDS.
There are a couple of issues with this:
1. A cursor reset will trigger a lot of writes: the row for each repo from the
affected PDS will have to be updated.
2. We have no information about `[FirstUninterruptedFirehoseRev, now)` interval
until we see a new commit for a repo, which might take a long time, or never
happen at all.
### Reducing the number of writes
We can rely on the firehose cursor value imposing additional ordering on
commits.
1. Start tracking firehose stream continuity by storing
`FirstUninterruptedCursor` for each PDS
2. When receiving a commit from firehose, compare `FirstUninterruptedCursor`
between repo and PDS entries:
* If `Repo`.`FirstUninterruptedCursor` < `PDS`.`FirstUninterruptedCursor`,
set `FirstUninterruptedFirehoseRev` to the commit's `rev` and copy
`FirstUninterruptedCursor` from PDS entry.
Now during a cursor reset we need to only change `FirstUninterruptedCursor` in
the PDS entry. And if `Repo`.`FirstUninterruptedCursor` <
`PDS`.`FirstUninterruptedCursor` - we know that repo's hosting PDS reset our
cursor at some point and `FirstUninterruptedFirehoseRev` value is no longer
valid.
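A rough sketch of this scheme (struct and function names here are illustrative
assumptions only):

```go
type pdsState struct {
	FirstUninterruptedCursor int64
}

type repoState struct {
	FirstUninterruptedCursor      int64
	FirstUninterruptedFirehoseRev string
}

// A cursor reset now costs a single write: only the PDS row is touched.
func onCursorReset(p *pdsState, newCursor int64) {
	p.FirstUninterruptedCursor = newCursor
}

// Each repo is re-validated lazily, the next time we see a commit for it.
func onFirehoseCommit(p *pdsState, r *repoState, commitRev string) {
	if r.FirstUninterruptedCursor < p.FirstUninterruptedCursor {
		// The hosting PDS reset our cursor since this repo was last seen:
		// restart the repo's uninterrupted interval at this commit.
		r.FirstUninterruptedFirehoseRev = commitRev
		r.FirstUninterruptedCursor = p.FirstUninterruptedCursor
	}
}
```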
### Avoiding long wait for the first firehose event
We can fetch the full repo to index any missing records and advance
`LastCompleteRev` accordingly. But if we don't update
`Repo`.`FirstUninterruptedCursor` - it will stay smaller than
`PDS`.`FirstUninterruptedCursor` and `FirstUninterruptedFirehoseRev` will remain
invalid.
We can fix that with an additional assumption: PDS provides strong consistency
between the firehose and `getRepo` - if we have already seen cursor value `X`,
then the `getRepo` response will be up to date with all commits corresponding
to cursor values smaller than or equal to `X`.
1. Before fetching the repo, note the current `FirstUninterruptedCursor` value
of the repo's hosting PDS. (Or even the latest `Cursor` value)
2. Fetch and process the full repo checkout, setting `LastCompleteRev`
3. If `Repo`.`FirstUninterruptedCursor` < `PDS`.`FirstUninterruptedCursor` still
holds (i.e., no new records on firehose while we were re-indexing), then set
`Repo`.`FirstUninterruptedCursor` to the cursor value recorded in step 1.
With the above assumption, all records that happened between
`FirstUninterruptedFirehoseRev` and this cursor value were already processed
in step 2, so `FirstUninterruptedFirehoseRev` is again valid, until
`PDS`.`FirstUninterruptedCursor` moves forward again.
## Repo discovery
We have the ability to get a complete list of hosted repos from a PDS. The
response includes last known `rev` for each repo, but does not come attached
with a firehose cursor value. We're assuming here the same level of consistency
as with `getRepo`, and can initialize `Repo`.`FirstUninterruptedCursor` with the
value from the PDS entry recorded before making the call to list repos, and
`FirstUninterruptedFirehoseRev` to the returned `rev`.
TODO: consider whether it's worth not touching cursor/`rev` values here and
offloading their initialization to the indexing step described above.
## Updating `LastCompleteRev` based on firehose events
We have the option to only advance `LastCompleteRev` when processing the full
repo checkout. While completely valid, this is rather pessimistic: in the
absence of cursor resets, this value will remain arbitrarily old despite us
actually having a complete set of records for the repo. Consequently, when a
cursor reset eventually does happen, we'll assume that we're missing many more
records than we actually are.
Naively, we can simply update `LastCompleteRev` on every event (iff the
completeness intervals are currently overlapping). The drawback is that each
event, in addition to new record creation, will update the corresponding repo
entry. If we could avoid this, it would considerably reduce the number of
writes.
### Alternative 1: delay updates
We can delay updating `LastCompleteRev` from firehose events for some time and
elide multiple updates to the same repo into a single write. Delay duration
would have to be at least on the order of minutes for this to be effective,
since writes to any single repo are usually initiated by human actions and have
a very low rate.
This way we can trade some RAM for reduction in writes.
### Alternative 2: skip frequent updates
Similar to the above, but instead of delaying updates, simply skip them if last
update was recent enough. This will often result in `LastCompleteRev` not
reflecting *actual* last complete `rev` for a repo, but it will keep it recent
enough.
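A tiny sketch of this alternative (the threshold and the bookkeeping map are
assumptions picked for illustration; assumes `"time"` is imported):

```go
var lastRevWrite = map[string]time.Time{} // keyed by repo DID

// maybeUpdateLastCompleteRev skips the write if the previous one for this
// repo was recent; LastCompleteRev may lag a little, but stays fresh enough.
func maybeUpdateLastCompleteRev(did, rev string, write func(did, rev string) error) error {
	const minInterval = 15 * time.Minute
	if time.Since(lastRevWrite[did]) < minInterval {
		return nil
	}
	if err := write(did, rev); err != nil {
		return err
	}
	lastRevWrite[did] = time.Now()
	return nil
}
```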
## Detailed design
### Bad naming
In the implementation not enough attention was paid to naming things, and their
usage and meaning slightly changed over time, so in the sections below and in
the code some of the things mentioned above are named differently:
* `LastCompleteRev` - max(`LastIndexedRev`, `LastFirehoseRev`)
* `FirstUninterruptedCursor` - `FirstCursorSinceReset`
* `FirstUninterruptedFirehoseRev` - `FirstRevSinceReset`
### Metadata fields
#### PDS
* `Cursor` - last cursor value received from this PDS.
* `FirstCursorSinceReset` - earliest cursor we have uninterrupted sequence of
records up to now.
#### Repo
* `LastIndexedRev` - last `rev` recorded during most recent full repo re-index
* Up to this `rev` we do have all records
* `FirstRevSinceReset` - first `rev` seen on firehose since the most recent
cursor reset.
* Changes only when an event for this repo is received, so it alone doesn't
guarantee that we have all subsequent records
* `FirstCursorSinceReset` - copy of the PDS field with the same name.
* If `FirstCursorSinceReset` >= `PDS`.`FirstCursorSinceReset` and PDS's
firehose is live - then we indeed have all records since
`FirstRevSinceReset`
* `LastFirehoseRev` - last `rev` seen on the firehose while we didn't have any
interruptions
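For illustration, these fields can be sketched as Go structs roughly like this
(a simplified assumption, not the actual GORM models):

```go
type PDS struct {
	Cursor                int64 // last cursor value received from this PDS
	FirstCursorSinceReset int64 // earliest cursor with an uninterrupted stream up to now
}

type Repo struct {
	LastIndexedRev        string // last `rev` from the most recent full re-index
	FirstRevSinceReset    string // first `rev` seen on firehose since the last cursor reset
	FirstCursorSinceReset int64  // copy of the PDS field with the same name
	LastFirehoseRev       string // last `rev` seen on firehose with no interruptions
}
```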
### Guarantees
* Up to and including `LastIndexedRev` - all records have been indexed.
* If `LastFirehoseRev` is set - all records up to and including it have been
indexed.
* If `FirstCursorSinceReset` >= `PDS`.`FirstCursorSinceReset`:
* Starting from and including `FirstRevSinceReset` - we have indexed all newer
records
* Consequently, if max(`LastIndexedRev`, `LastFirehoseRev`) >=
`FirstRevSinceReset` - we have a complete copy of the repo
* If `FirstCursorSinceReset` < `PDS`.`FirstCursorSinceReset`:
* There was a cursor reset, we might be missing some records after
`FirstRevSinceReset`
* `FirstCursorSinceReset` on both repos and PDSs never gets rolled back
* `LastIndexedRev` never gets rolled back
### Operations
#### Indexing a repo
* Resolve the current PDS hosting the repo and store its `FirstCursorSinceReset`
in a variable
* If the PDS is different from the one we have on record (i.e., the repo
migrated) - update accordingly
* Fetch the repo
* Upsert all fetched records
* Set `LastIndexedRev` to `rev` of the fetched repo
* In a transaction check if `Repo`.`FirstCursorSinceReset` >= the value stored
in the first step, and set it to that value if it isn't.
* Assumption here is that a PDS returns strongly consistent responses for a
single repo, and fetching the repo will include all records corresponding to
a cursor value generated before that.
#### Connecting to firehose
* If the first message is `#info` - this means that our cursor is too old
* Update PDS's `FirstCursorSinceReset` to the value supplied in the `#info`
message
Workaround for a buggy relay that doesn't send `#info`:
* If the first message has cursor value that is different from `Cursor`+1:
* Assume there was a cursor reset and update PDS's `FirstCursorSinceReset` to
the value provided in the message
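A sketch of these connect-time checks, reusing the illustrative `PDS` struct
from the Metadata fields section (message decoding is elided; names are
assumptions):

```go
// onFirstFirehoseMessage applies the cursor-reset detection rules above.
func onFirstFirehoseMessage(p *PDS, isInfo bool, msgCursor int64) {
	if isInfo {
		// #info: the PDS told us our cursor is too old.
		p.FirstCursorSinceReset = msgCursor
		return
	}
	// Workaround for a relay that doesn't send #info: any gap in cursor
	// numbering is treated as a cursor reset.
	if msgCursor != p.Cursor+1 {
		p.FirstCursorSinceReset = msgCursor
	}
}
```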
#### Receiving event on firehose
* Check that the event is coming from the correct PDS for a given repo
* TODO: maybe drop this and just check the signature
* Process the event normally
* If `Repo`.`FirstCursorSinceReset` >= `PDS`.`FirstCursorSinceReset`:
* Update `LastFirehoseRev` to event's `rev`
* If `Repo`.`FirstCursorSinceReset` < `PDS`.`FirstCursorSinceReset`:
* Set repo's `FirstRevSinceReset` to the event's `rev` and
`FirstCursorSinceReset` to `PDS`.`FirstCursorSinceReset`
* If `tooBig` flag is set on the message (MST diff was larger than PDS's size
limit, so some records were dropped):
* Set repo's `FirstRevSinceReset` to the event's `rev` and
`FirstCursorSinceReset` to `PDS`.`FirstCursorSinceReset`
* Note: `FirstCursorSinceReset` might be the same, but moving forward
`FirstRevSinceReset` likely will trigger repo reindexing
* Update PDS's `Cursor` to the value provided in the message
#### Listing repos
* Fetch a list of repos from a PDS. Response also includes the last `rev` for
every repo.
* For each repo:
* If `FirstRevSinceReset` is not set:
* Set `FirstRevSinceReset` to received `rev`
* Set `FirstCursorSinceReset` to the PDS's `FirstCursorSinceReset`
#### Repo migrating to a different PDS
TODO
Currently we're simply resetting `FirstRevSinceReset`.
#### Finding repos that need indexing
* Repo index is incomplete and needs to be indexed if one of these is true:
* `LastIndexedRev` is not set
* max(`LastIndexedRev`, `LastFirehoseRev`) < `FirstRevSinceReset`
* `Repo`.`FirstCursorSinceReset` < `PDS`.`FirstCursorSinceReset`
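Putting the conditions together, a sketch of this predicate (using the
illustrative `Repo`/`PDS` structs from above; an empty string stands for an
unset `rev`):

```go
// needsIndexing reports whether the repo may have gaps and should be
// scheduled for a re-index.
func needsIndexing(r Repo, p PDS) bool {
	if r.LastIndexedRev == "" {
		return true
	}
	lastComplete := r.LastIndexedRev
	if r.LastFirehoseRev > lastComplete {
		lastComplete = r.LastFirehoseRev
	}
	return lastComplete < r.FirstRevSinceReset ||
		r.FirstCursorSinceReset < p.FirstCursorSinceReset
}
```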

@ -1,62 +0,0 @@
#!/bin/sh
set -e
# ------------------------------ Write data timestamp ----------------------------------
echo "export_start" > timestamp.csv
date -Iseconds --utc >> timestamp.csv
# ------------------------------ Refresh views ----------------------------------
docker compose exec -iT postgres psql -U postgres -d bluesky <<- EOF
\timing
\echo Refreshing follows...
refresh materialized view export_follows;
\echo Refreshing like counts...
refresh materialized view export_likes;
\echo Refreshing reply counts...
refresh materialized view export_replies;
\echo Refreshing block list...
refresh materialized view export_blocks;
\echo Refreshing DID list...
refresh materialized view export_dids;
EOF
# ------------------------------ Dump views into .csv ----------------------------------
echo "Writing .csv files..."
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "copy (select * from export_follows) to stdout with csv header;" > follows.csv
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "copy (select * from export_likes) to stdout with csv header;" > like_counts.csv
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "copy (select * from export_replies) to stdout with csv header;" > post_counts.csv
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "copy (select * from export_blocks) to stdout with csv header;" > blocks.csv
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "copy (select * from export_dids) to stdout with csv header;" > dids.csv
# ------------------------------ Free up space used by materialized views ----------------------------------
docker compose exec -iT postgres psql -U postgres -d bluesky <<- EOF
\timing
refresh materialized view export_follows with no data;
refresh materialized view export_likes with no data;
refresh materialized view export_replies with no data;
refresh materialized view export_blocks with no data;
refresh materialized view export_dids with no data;
EOF
# ------------------------------ Dump handles from plc-mirror ----------------------------------
docker compose exec -iT postgres psql -U postgres -d bluesky <<- EOF | sed -E -e 's/([^\\])\\",/\1\\\\",/g' > handles.csv
\timing
select did as "did:ID", replace(operation['alsoKnownAs'] ->> 0, 'at://', '') as handle
from plc_log_entries
where (did, plc_timestamp) in (
select did, max(plc_timestamp) as plc_timestamp from plc_log_entries
where not nullified
group by did
)
EOF

@ -1,120 +0,0 @@
#!/bin/bash
source .env
set -e
# ------------------------------ Write data timestamp ----------------------------------
date=$(date -Idate --utc)
mkdir -p ${CSV_DIR}/full
mkdir -p ${CSV_DIR}/full/${date}
echo "Output directory: ${CSV_DIR}/full/${date}"
to_timestamp=$(date -Iseconds --utc)
echo "export_start" > ${CSV_DIR}/full/${date}/timestamp.csv
echo "${to_timestamp}" >> ${CSV_DIR}/full/${date}/timestamp.csv
# ------------------------------ Refresh views ----------------------------------
docker compose exec -iT postgres psql -U postgres -d bluesky <<- EOF
\timing
\echo Refreshing follows...
refresh materialized view export_follows;
\echo Refreshing like counts...
refresh materialized view export_likes_ladder;
\echo Refreshing reply counts...
refresh materialized view export_replies_ladder;
\echo Refreshing block list...
refresh materialized view export_blocks;
\echo Refreshing DID list...
refresh materialized view export_dids_ladder;
\echo Refreshing optout list...
refresh materialized view export_optouts;
EOF
# ------------------------------ Dump views into .csv ----------------------------------
echo "Writing .csv files..."
echo "Starting follows export..."
follows_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$folows_started', '$to_timestamp', 'app.bsky.graph.follow')"
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "copy (select * from export_follows) to stdout with csv header;" > ${CSV_DIR}/full/${date}/follows.csv
echo "Finishing follows export..."
follows_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "update incremental_export_log set finished='$folows_finished' where started='$folows_started' and to_tsmp='$to_timestamp' and collection = 'app.bsky.graph.follow'"
echo "Starting blocks export..."
block_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$block_started', '$to_timestamp', 'app.bsky.graph.block')"
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "copy (select * from export_blocks) to stdout with csv header;" > ${CSV_DIR}/full/${date}/blocks.csv
echo "Finishing blocks export..."
block_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "update incremental_export_log set finished='$block_finished' where started='$block_started' and to_tsmp='$to_timestamp' and collection = 'app.bsky.graph.block'"
echo "Starting likes export..."
likes_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$likes_started', '$to_timestamp', 'app.bsky.feed.like')"
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "copy (select * from export_likes_ladder) to stdout with csv header;" > ${CSV_DIR}/full/${date}/like_counts.csv
echo "Finishing likes export..."
likes_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "update incremental_export_log set finished='$likes_finished' where started='$likes_started' and to_tsmp='$to_timestamp' and collection = 'app.bsky.feed.like'"
echo "Starting posts export..."
posts_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$posts_started', '$to_timestamp', 'app.bsky.feed.post')"
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "copy (select * from export_replies_ladder) to stdout with csv header;" > ${CSV_DIR}/full/${date}/post_counts.csv
echo "Finishing posts export..."
posts_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "update incremental_export_log set finished='$posts_finished' where started='$posts_started' and to_tsmp='$to_timestamp' and collection = 'app.bsky.feed.post'"
echo "Starting dids export..."
dids_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$dids_started', '$to_timestamp', 'did')"
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "copy (select * from export_dids_ladder) to stdout with csv header;" > ${CSV_DIR}/full/${date}/dids.csv
echo "Finishing dids export..."
dids_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "update incremental_export_log set finished='$dids_finished' where started='$dids_started' and to_tsmp='$to_timestamp' and collection = 'did'"
echo "Starting optouts export..."
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "copy (select did from repos as r inner join records_block as rb on r.id=rb.repo where rb.content['subject']::text like '%did:plc:qevje4db3tazfbbialrlrkza%') to stdout with csv header;" > ${CSV_DIR}/full/${date}/optout.csv
echo "Finishing optouts export..."
# ------------------------------ DO NOT Free up space used by materialized views for incremental refresh ----------------------------------
# ------------------------------ Dump handles from plc-mirror ----------------------------------
echo "Starting handles export..."
handles_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$handles_started', '$to_timestamp', 'handle')"
docker exec -t plc-postgres-1 psql -U postgres -d plc \
-c 'copy (select handle, did as "did:ID" from actors) to stdout with (format csv , header, force_quote ("handle"));' | sed -E -e 's/([^\\])\\",/\1\\\\",/g' > ${CSV_DIR}/full/${date}/handles.csv
echo "Finishing handles export..."
handles_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "update incremental_export_log set finished='$handles_finished' where started='$handles_started' and to_tsmp='$to_timestamp' and collection = 'handle'"
echo "Export finished."

@ -1,118 +0,0 @@
#!/bin/bash
source .env
set -e
# ------------------------------ Write data timestamp ----------------------------------
date=$(date -Idate --utc)
mkdir -p ${CSV_DIR}/monthly
mkdir -p ${CSV_DIR}/monthly/${date}
echo "Output directory: ${CSV_DIR}/monthly/${date}"
to_timestamp=$(date -Iseconds --utc)
echo "export_start" > ${CSV_DIR}/monthly/${date}/timestamp.csv
echo "${to_timestamp}" >> ${CSV_DIR}/monthly/${date}/timestamp.csv
# ------------------------------ Refresh views ----------------------------------
docker compose exec -iT postgres psql -U postgres -d bluesky <<- EOF
\timing
\echo Refreshing follows...
refresh materialized view export_follows_month;
\echo Refreshing like counts...
refresh materialized view export_likes_month;
\echo Refreshing reply counts...
refresh materialized view export_replies_month;
\echo Refreshing block list...
refresh materialized view export_blocks_month;
\echo Refreshing DID list...
refresh materialized view export_dids_month;
\echo Refreshing optout list...
refresh materialized view export_optouts;
EOF
# ------------------------------ Dump views into .csv ----------------------------------
echo "Writing .csv files..."
echo "Starting follows export..."
follows_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$folows_started', '$to_timestamp', 'app.bsky.graph.follow_month')"
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "copy (select * from export_follows_month) to stdout with csv header;" > ${CSV_DIR}/monthly/${date}/follows.csv
echo "Finishing follows export..."
follows_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "update incremental_export_log set finished='$folows_finished' where started='$folows_started' and to_tsmp='$to_timestamp' and collection = 'app.bsky.graph.follow_month'"
echo "Starting blocks export..."
block_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$block_started', '$to_timestamp', 'app.bsky.graph.block_month')"
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "copy (select * from export_blocks_month) to stdout with csv header;" > ${CSV_DIR}/monthly/${date}/blocks.csv
echo "Finishing blocks export..."
block_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "update incremental_export_log set finished='$block_finished' where started='$block_started' and to_tsmp='$to_timestamp' and collection = 'app.bsky.graph.block_month'"
echo "Starting likes export..."
likes_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$likes_started', '$to_timestamp', 'app.bsky.feed.like_month')"
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "copy (select * from export_likes_month) to stdout with csv header;" > ${CSV_DIR}/monthly/${date}/like_counts.csv
echo "Finishing likes export..."
likes_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "update incremental_export_log set finished='$likes_finished' where started='$likes_started' and to_tsmp='$to_timestamp' and collection = 'app.bsky.feed.like_month'"
echo "Starting posts export..."
posts_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$posts_started', '$to_timestamp', 'app.bsky.feed.post_month')"
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "copy (select * from export_replies_month) to stdout with csv header;" > ${CSV_DIR}/monthly/${date}/post_counts.csv
echo "Finishing posts export..."
posts_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "update incremental_export_log set finished='$posts_finished' where started='$posts_started' and to_tsmp='$to_timestamp' and collection = 'app.bsky.feed.post_month'"
echo "Starting dids export..."
dids_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$dids_started', '$to_timestamp', 'did_month')"
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "copy (select * from export_dids_month) to stdout with csv header;" > ${CSV_DIR}/monthly/${date}/dids.csv
echo "Finishing dids export..."
dids_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "update incremental_export_log set finished='$dids_finished' where started='$dids_started' and to_tsmp='$to_timestamp' and collection = 'did_month'"
echo "Starting optouts export..."
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "copy (select did from repos as r inner join records_block as rb on r.id=rb.repo where rb.content['subject']::text like '%did:plc:qevje4db3tazfbbialrlrkza%') to stdout with csv header;" > ${CSV_DIR}/monthly/${date}/optout.csv
echo "Finishing optouts export..."
# ------------------------------ DO NOT Free up space used by materialized views for incremental refresh ----------------------------------
# ------------------------------ Dump handles from plc-mirror ----------------------------------
echo "Starting handles export..."
handles_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$handles_started', '$to_timestamp', 'handle_month')"
docker exec -t plc-postgres-1 psql -U postgres -d plc \
-c 'copy (select handle, did as "did:ID" from actors) to stdout with (format csv , header, force_quote ("handle"));' | sed -E -e 's/([^\\])\\",/\1\\\\",/g' > ${CSV_DIR}/monthly/${date}/handles.csv
echo "Finishing handles export..."
handles_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
-c "update incremental_export_log set finished='$handles_finished' where started='$handles_started' and to_tsmp='$to_timestamp' and collection = 'handle_month'"
echo "Export finished."

@ -1,28 +0,0 @@
.dashboard | . as $dash
| [paths(type == "object"
and (.datasource?.uid? | type) == "string"
and .datasource.type? == "prometheus")] as $uids
| reduce $uids[] as $path ([]; ($dash | getpath($path).datasource.uid) as $uid | if [.[] == $uid] | any then . else . + [$uid] end)
| . as $unique_uids
| [range($unique_uids | length) | {key: $unique_uids[.], value: "DS\(.+1)"}]
| from_entries as $uid_map
| reduce $uids[] as $path ($dash; setpath($path + ["datasource", "uid"]; "${\($uid_map[getpath($path).datasource.uid])}"))
| reduce paths(type == "object" and has("current") and has("datasource"))
as $path (.; setpath($path + ["current"]; {}))
| .id = null
| .__inputs = [$unique_uids[] | {
name: $uid_map[.],
label: "Prometheus",
description: "",
type: "datasource",
pluginId: "prometheus",
pluginName: "Prometheus",
}]
| .__requires = []
| .__elements = {}

@ -1,13 +0,0 @@
#!/bin/sh
set -e
cd "$(dirname "$0")"
. ../.env
: ${DASHBOARD_NAME:=indexer}
: ${DASHBOARD_UID:="$(jq -r .uid "${DASHBOARD_NAME}.json")"}
curl -s --fail-with-body "${GRAFANA_URL}/api/dashboards/uid/${DASHBOARD_UID}" | jq --sort-keys -f export.jq > "${DASHBOARD_NAME}.json"

File diff suppressed because it is too large
@ -1,18 +0,0 @@
$current[0].dashboard as $cur
| ([$cur | .. | select(.datasource?.type? == "prometheus")] | first | .datasource.uid) as $datasource
| .templating.list = [
.templating.list[] | .name as $name
| .current = ($cur.templating.list[] | select(.name == $name) | .current) // {}
]
| . as $dash
| [paths(type == "object"
and .datasource.type? == "prometheus")] as $uids
| reduce $uids[] as $path ($dash; setpath($path + ["datasource", "uid"]; $datasource))
| .id = $cur.id
| .version = $cur.version
| {dashboard: ., overwrite: false}

@ -1,24 +0,0 @@
#!/bin/sh
set -e
cd "$(dirname "$0")"
. ../.env
: ${DASHBOARD_NAME:=indexer}
: ${DASHBOARD_UID:="$(jq -r .uid "${DASHBOARD_NAME}.json")"}
if ! curl -X HEAD -s --fail-with-body "${GRAFANA_URL}/api/dashboards/uid/${DASHBOARD_UID}"; then
echo "Dashboard with UID ${DASHBOARD_UID} is not found. Please import $(dirname "$0")/${DASHBOARD_NAME}.json once, and later use this command again to update it." >&2
exit 1
fi
CUR_DASHBOARD="$(mktemp -t "${DASHBOARD_NAME}.json.XXXXXXX")"
curl -s --fail-with-body "${GRAFANA_URL}/api/dashboards/uid/${DASHBOARD_UID}" > "${CUR_DASHBOARD}"
jq --slurpfile current "${CUR_DASHBOARD}" \
-f update.jq "${DASHBOARD_NAME}.json" \
| curl --json @- -s --fail-with-body "${GRAFANA_URL}/api/dashboards/db"
rm "${CUR_DASHBOARD}"

@ -1,5 +0,0 @@
# DB migrations
WARNING: due to partitioning schema changes (which require re-creating the
tables from scratch), some migrations were **edited**. Their previous versions
have been copied to `migrations/obsolete` folder.

@ -1,22 +0,0 @@
insert into pds (host) values ('https://agaric.us-west.host.bsky.network'),
('https://amanita.us-east.host.bsky.network'),
('https://blewit.us-west.host.bsky.network'),
('https://boletus.us-west.host.bsky.network'),
('https://bsky.social'),
('https://chaga.us-west.host.bsky.network'),
('https://conocybe.us-west.host.bsky.network'),
('https://enoki.us-east.host.bsky.network'),
('https://hydnum.us-west.host.bsky.network'),
('https://inkcap.us-east.host.bsky.network'),
('https://lepista.us-west.host.bsky.network'),
('https://lionsmane.us-east.host.bsky.network'),
('https://maitake.us-west.host.bsky.network'),
('https://morel.us-east.host.bsky.network'),
('https://oyster.us-east.host.bsky.network'),
('https://porcini.us-east.host.bsky.network'),
('https://puffball.us-east.host.bsky.network'),
('https://russula.us-west.host.bsky.network'),
('https://shiitake.us-east.host.bsky.network'),
('https://shimeji.us-east.host.bsky.network'),
('https://verpa.us-west.host.bsky.network')
on conflict do nothing;

@ -1,93 +0,0 @@
\timing
CREATE EXTENSION pg_partman SCHEMA public;
alter table records rename to records_like;
create table records
(like records_like including defaults)
partition by list (collection);
drop index idx_repo_record_key;
drop index idx_repo_rev;
alter sequence records_id_seq owned by records.id;
drop table records_like;
create index on records (collection, repo, rkey);
CREATE OR REPLACE FUNCTION setup_partition(in collection text, in suffix text) RETURNS boolean AS $$
BEGIN
EXECUTE 'CREATE TABLE records_' || suffix ||
' PARTITION OF records FOR VALUES IN (' || quote_literal(collection) || ')
PARTITION BY RANGE (created_at)';
EXECUTE 'CREATE INDEX ON records_' || suffix || ' (created_at)';
EXECUTE 'alter table records_' || suffix || ' add check (collection = ' || quote_literal(collection) || ')';
PERFORM public.create_parent('public.records_' || suffix, 'created_at', '1 month',
p_start_partition := '2024-02-01');
RETURN true;
END;
$$ LANGUAGE plpgsql;
select setup_partition('app.bsky.feed.like', 'like');
select setup_partition('app.bsky.feed.post', 'post');
select setup_partition('app.bsky.graph.follow', 'follow');
select setup_partition('app.bsky.graph.block', 'block');
select setup_partition('app.bsky.feed.repost', 'repost');
select setup_partition('app.bsky.actor.profile', 'profile');
select setup_partition('app.bsky.graph.list', 'list');
select setup_partition('app.bsky.graph.listblock', 'listblock');
select setup_partition('app.bsky.graph.listitem', 'listitem');
CREATE TABLE records_default
PARTITION OF records DEFAULT
PARTITION BY RANGE (created_at);
CREATE INDEX ON records_default (created_at);
SELECT public.create_parent('public.records_default', 'created_at', '1 month',
p_start_partition := '2024-02-01');
create index idx_like_subject
on records_like
(split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3));
create index idx_follow_subject
on records_follow
(jsonb_extract_path_text(content, 'subject'));
create index idx_reply_subject
on records_post
(split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3));
create index listitem_uri_subject
on records_listitem
(
jsonb_extract_path_text(content, 'list'),
jsonb_extract_path_text(content, 'subject'))
include (deleted);
create index listitem_subject_uri
on records_listitem
(
jsonb_extract_path_text(content, 'subject'),
jsonb_extract_path_text(content, 'list'))
include (deleted);
create view listitems as
select *, jsonb_extract_path_text(content, 'list') as list,
jsonb_extract_path_text(content, 'subject') as subject
from records_listitem;
create view lists as
select records_list.*,
jsonb_extract_path_text(content, 'name') as name,
jsonb_extract_path_text(content, 'description') as description,
jsonb_extract_path_text(content, 'purpose') as purpose,
'at://' || repos.did || '/app.bsky.graph.list/' || rkey as uri
from records_list join repos on records_list.repo = repos.id;

@ -1,28 +0,0 @@
create index post_langs on records_post using gin (jsonb_extract_path(content, 'langs') jsonb_ops);
-- There are invalid/non-conforming values that need to be handled somehow.
create function parse_timestamp(text)
returns timestamp
returns null on null input
immutable
as
$$
begin
begin
return $1::timestamp;
exception
when others then
return null;
end;
end;
$$
language plpgsql;
create index post_created_at on records_post (parse_timestamp(jsonb_extract_path_text(content, 'createdAt')));
create view posts as
select *, jsonb_extract_path(content, 'langs') as langs,
parse_timestamp(jsonb_extract_path_text(content, 'createdAt')) as content_created_at
from records_post;
explain select count(*) from posts where langs ? 'uk' and content_created_at > now() - interval '1 day';

@ -1,46 +0,0 @@
-- Create a bunch of materialized views, but don't populate them right away.
create materialized view export_follows
as select repos.did as ":START_ID",
records.content ->> 'subject' as ":END_ID"
from repos join records on repos.id = records.repo
where records.collection = 'app.bsky.graph.follow'
and records.content ->> 'subject' <> repos.did
with no data;
create index export_follow_subject on export_follows (":END_ID");
-- Thanks to `join`, eats up 30GB+ of space while refreshing, but
-- finishes in under an hour.
create materialized view export_likes
as select repos.did as ":START_ID",
split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3) as ":END_ID",
count(*) as "count:long"
from records join repos on records.repo = repos.id
where records.collection = 'app.bsky.feed.like'
and repos.did <> split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3)
group by repos.did, split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3)
with no data;
create index export_like_subject on export_likes (":END_ID");
create materialized view export_replies
as select repos.did as ":START_ID",
split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3) as ":END_ID",
count(*) as "count:long"
from records join repos on records.repo = repos.id
where records.collection = 'app.bsky.feed.post'
and repos.did <> split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3)
group by repos.did, split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3)
with no data;
create index export_reply_subject on export_replies (":END_ID");
create materialized view export_dids
as select distinct did as "did:ID" from (
select did from repos
union
select distinct ":END_ID" as did from export_follows
union
select distinct ":END_ID" as did from export_likes
union
select distinct ":END_ID" as did from export_replies
)
with no data;

@ -1,10 +0,0 @@
-- Create a block materialized view, don't populate right away.
create materialized view export_blocks
as select repos.did as ":START_ID",
records.content ->> 'subject' as ":END_ID"
from repos join records on repos.id = records.repo
where records.collection = 'app.bsky.graph.block'
and records.content ->> 'subject' <> repos.did
with no data;
create index export_block_subject on export_blocks (":END_ID");

@ -1,16 +0,0 @@
CREATE TABLE incremental_export_log (
id SERIAL PRIMARY KEY,
collection text NOT NULL,
to_tsmp TIMESTAMP NOT NULL,
started TIMESTAMP,
finished TIMESTAMP
);
CREATE UNIQUE INDEX incremental_export_log_idx on incremental_export_log ("collection", "to_tsmp");
-- manually insert your latest snapshot here
-- insert into incremental_export_log (started, finished, to_tsmp, collection) values ('2024-02-27T05:53:30+00:00', '2024-02-27T07:23:30+00:00', '2024-02-27T05:53:30+00:00', 'app.bsky.graph.follow');
-- insert into incremental_export_log (started, finished, to_tsmp, collection) values ('2024-02-27T05:53:30+00:00', '2024-02-27T07:23:30+00:00', '2024-02-27T05:53:30+00:00', 'app.bsky.feed.like');
-- insert into incremental_export_log (started, finished, to_tsmp, collection) values ('2024-02-27T05:53:30+00:00', '2024-02-27T07:23:30+00:00', '2024-02-27T05:53:30+00:00', 'app.bsky.feed.post');
-- insert into incremental_export_log (started, finished, to_tsmp, collection) values ('2024-02-27T05:53:30+00:00', '2024-02-27T07:23:30+00:00', '2024-02-27T05:53:30+00:00', 'did');
-- insert into incremental_export_log (started, finished, to_tsmp, collection) values ('2024-02-27T05:53:30+00:00', '2024-02-27T07:23:30+00:00', '2024-02-27T05:53:30+00:00', 'handle');

@ -1,16 +0,0 @@
drop materialized view export_dids;
create materialized view export_dids
as select distinct did as "did:ID" from (
select did from repos
union
select distinct ":END_ID" as did from export_follows
union
select distinct ":END_ID" as did from export_likes
union
select distinct ":END_ID" as did from export_replies
union
select distinct ":END_ID" as did from export_blocks
)
with no data;

@ -1,5 +0,0 @@
drop materialized view export_optouts;
create materialized view export_optouts
as select did as "did:ID" from repos as r inner join records_block as rb on r.id=rb.repo where rb.content['subject']::text like '%did:plc:qevje4db3tazfbbialrlrkza%'
with no data;

@ -1,62 +0,0 @@
-- Create a bunch of materialized views, but don't populate them right away.
create materialized view export_follows_month
as select repos.did as ":START_ID",
records.content ->> 'subject' as ":END_ID"
from repos join records on repos.id = records.repo
where records.collection = 'app.bsky.graph.follow'
and records.content ->> 'subject' <> repos.did
and records.created_at > CURRENT_DATE - INTERVAL '30' DAY
with no data;
create index export_follow_subject_month on export_follows_month (":END_ID");
-- Thanks to `join`, eats up 30GB+ of space while refreshing, but
-- finishes in under an hour.
create materialized view export_likes_month
as select repos.did as ":START_ID",
split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3) as ":END_ID",
count(*) as "count:long"
from records join repos on records.repo = repos.id
where records.collection = 'app.bsky.feed.like'
and records.created_at > CURRENT_DATE - INTERVAL '30' DAY
and repos.did <> split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3)
group by repos.did, split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3)
with no data;
create index export_like_subject_month on export_likes_month (":END_ID");
create materialized view export_replies_month
as select repos.did as ":START_ID",
split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3) as ":END_ID",
count(*) as "count:long"
from records join repos on records.repo = repos.id
where records.collection = 'app.bsky.feed.post'
and records.created_at > CURRENT_DATE - INTERVAL '30' DAY
and repos.did <> split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3)
group by repos.did, split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3)
with no data;
create index export_reply_subject_month on export_replies_month (":END_ID");
create materialized view export_blocks_month
as select repos.did as ":START_ID",
records.content ->> 'subject' as ":END_ID"
from repos join records on repos.id = records.repo
where records.collection = 'app.bsky.graph.block'
and records.created_at > CURRENT_DATE - INTERVAL '30' DAY
and records.content ->> 'subject' <> repos.did
with no data;
create index export_block_subject_month on export_blocks_month (":END_ID");
create materialized view export_dids_month
as select distinct did as "did:ID" from (
select did from repos
union
select distinct ":END_ID" as did from export_follows_month
union
select distinct ":END_ID" as did from export_likes_month
union
select distinct ":END_ID" as did from export_replies_month
union
select distinct ":END_ID" as did from export_blocks_month
)
with no data;
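
As with the full-history views, these stay empty until refreshed. A small usage sketch: refresh the 30-day like counts and list the accounts that received the most likes over that window.

```
refresh materialized view export_likes_month;

-- Accounts that received the most likes in the last 30 days.
select ":END_ID" as did, sum("count:long") as likes_received
from export_likes_month
group by ":END_ID"
order by likes_received desc
limit 20;
```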

View File

@ -1,47 +0,0 @@
drop materialized view export_dids_ladder;
drop materialized view export_replies_ladder;
drop materialized view export_likes_ladder;
create materialized view export_likes_ladder
as select repos.did as ":START_ID",
split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3) as ":END_ID",
count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '30' DAY) * 10 +
count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '60' DAY) * 5 +
count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '90' DAY) * 3 +
count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '360' DAY) * 1 as "count:long"
from records join repos on records.repo = repos.id
where records.collection = 'app.bsky.feed.like'
and repos.did <> split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3)
group by repos.did, split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3)
with no data;
create index export_like_subject_ladder on export_likes_ladder (":END_ID");
create materialized view export_replies_ladder
as select repos.did as ":START_ID",
split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3) as ":END_ID",
count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '30' DAY) * 10 +
count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '60' DAY) * 5 +
count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '90' DAY) * 3 +
count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '360' DAY) * 1 as "count:long"
from records join repos on records.repo = repos.id
where records.collection = 'app.bsky.feed.post'
and repos.did <> split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3)
group by repos.did, split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3)
with no data;
create index export_reply_subject_ladder on export_replies_ladder (":END_ID");
create materialized view export_dids_ladder
as select distinct did as "did:ID" from (
select did from repos
union
select distinct ":END_ID" as did from export_follows
union
select distinct ":END_ID" as did from export_likes_ladder
union
select distinct ":END_ID" as did from export_replies_ladder
union
select distinct ":END_ID" as did from export_blocks
)
with no data;
create index idx_records_created_at on records (created_at);
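
The four `FILTER` buckets are cumulative: an interaction is counted in every window it falls into, so a 20-day-old like scores 10 + 5 + 3 + 1 = 19, a 75-day-old one scores 3 + 1 = 4, and a 200-day-old one scores just 1. A tiny self-contained check of that arithmetic (the ages are made-up sample values):

```
select
  count(*) filter (where age_days <= 30) * 10 +
  count(*) filter (where age_days <= 60) * 5 +
  count(*) filter (where age_days <= 90) * 3 +
  count(*) filter (where age_days <= 360) * 1 as score
from (values (20), (75), (200)) as likes(age_days);
-- => 24 (19 + 4 + 1)
```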

View File

@ -1,47 +0,0 @@
drop materialized view export_dids_ladder;
drop materialized view export_replies_ladder;
drop materialized view export_likes_ladder;
create materialized view export_likes_ladder
as select repos.did as ":START_ID",
split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3) as ":END_ID",
count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '30' DAY) * 10 +
count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '60' DAY) * 5 +
count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '90' DAY) * 3 +
count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '360' DAY) * 1 as "count:long"
from records join repos on records.repo = repos.id
where records.collection = 'app.bsky.feed.like'
and repos.did <> split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3)
and repos.did like 'did:%'
group by repos.did, split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3)
with no data;
create index export_like_subject_ladder on export_likes_ladder (":END_ID");
create materialized view export_replies_ladder
as select repos.did as ":START_ID",
split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3) as ":END_ID",
count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '30' DAY) * 10 +
count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '60' DAY) * 5 +
count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '90' DAY) * 3 +
count(*) FILTER (WHERE records.created_at > CURRENT_DATE - INTERVAL '360' DAY) * 1 as "count:long"
from records join repos on records.repo = repos.id
where records.collection = 'app.bsky.feed.post'
and repos.did <> split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3)
and repos.did like 'did:%'
group by repos.did, split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3)
with no data;
create index export_reply_subject_ladder on export_replies_ladder (":END_ID");
create materialized view export_dids_ladder
as select distinct did as "did:ID" from (
select did from repos
union
select distinct ":END_ID" as did from export_follows
union
select distinct ":END_ID" as did from export_likes_ladder
union
select distinct ":END_ID" as did from export_replies_ladder
union
select distinct ":END_ID" as did from export_blocks
)
with no data;

View File

@ -1,62 +0,0 @@
alter table records rename to records_like;
create table records
(like records_like including defaults)
partition by list (collection);
drop index idx_repo_record_key;
drop index idx_repo_rev;
alter table records_like drop constraint records_pkey;
create unique index records_pkey on records (id, collection);
create table records_default
partition of records default;
create table records_post
partition of records for values in ('app.bsky.feed.post');
create table records_follow
partition of records for values in ('app.bsky.graph.follow');
create table records_block
partition of records for values in ('app.bsky.graph.block');
create table records_repost
partition of records for values in ('app.bsky.feed.repost');
create table records_profile
partition of records for values in ('app.bsky.actor.profile');
ALTER TABLE records_like
ADD CHECK (collection in ('app.bsky.feed.like'));
ALTER TABLE records_post
ADD CHECK (collection in ('app.bsky.feed.post'));
ALTER TABLE records_follow
ADD CHECK (collection in ('app.bsky.graph.follow'));
ALTER TABLE records_repost
ADD CHECK (collection in ('app.bsky.feed.repost'));
ALTER TABLE records_profile
ADD CHECK (collection in ('app.bsky.actor.profile'));
-- SLOW, can run overnight
with moved_rows as (
delete from records_like r
where collection <> 'app.bsky.feed.like'
returning r.*
)
insert into records select * from moved_rows;
-- ULTRA SLOW, DO NOT RUN on large DB
alter table records attach partition records_like for values in ('app.bsky.feed.like');
create index idx_like_subject
on records_like
(split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3));
create index idx_follow_subject
on records_follow
(jsonb_extract_path_text(content, 'subject'));
create index idx_reply_subject
on records_post
(split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3));
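
After shuffling rows between partitions like this, it's worth sanity-checking that everything landed where expected. A sketch using PostgreSQL's `tableoid` system column, which resolves to the partition each row is physically stored in (slow on a full database):

```
select tableoid::regclass as partition, count(*) as row_count
from records
group by 1
order by 2 desc;
```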

View File

@ -1,54 +0,0 @@
alter table records detach partition records_default;
create table records_list
partition of records for values in ('app.bsky.graph.list');
create table records_listblock
partition of records for values in ('app.bsky.graph.listblock');
create table records_listitem
partition of records for values in ('app.bsky.graph.listitem');
ALTER TABLE records_list
ADD CHECK (collection in ('app.bsky.graph.list'));
ALTER TABLE records_listblock
ADD CHECK (collection in ('app.bsky.graph.listblock'));
ALTER TABLE records_listitem
ADD CHECK (collection in ('app.bsky.graph.listitem'));
with moved_rows as (
delete from records_default r
where collection in ('app.bsky.graph.list', 'app.bsky.graph.listblock', 'app.bsky.graph.listitem')
returning r.*
)
insert into records select * from moved_rows;
alter table records attach partition records_default default;
create index listitem_uri_subject
on records_listitem
(
jsonb_extract_path_text(content, 'list'),
jsonb_extract_path_text(content, 'subject'))
include (deleted);
create index listitem_subject_uri
on records_listitem
(
jsonb_extract_path_text(content, 'subject'),
jsonb_extract_path_text(content, 'list'))
include (deleted);
create view listitems as
select *, jsonb_extract_path_text(content, 'list') as list,
jsonb_extract_path_text(content, 'subject') as subject
from records_listitem;
create view lists as
select records_list.*,
jsonb_extract_path_text(content, 'name') as name,
jsonb_extract_path_text(content, 'description') as description,
jsonb_extract_path_text(content, 'purpose') as purpose,
'at://' || repos.did || '/app.bsky.graph.list/' || rkey as uri
from records_list join repos on records_list.repo = repos.id;
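
With the two views in place, list membership lookups become plain joins. A sketch that resolves the members of a single list by its `at://` URI (the URI is the same example list used in the docs elsewhere in this repo):

```
select li.subject as member_did
from lists l
join listitems li on li.list = l.uri
where l.uri = 'at://did:plc:2yqylcqgxier4l5uplp6w6jh/app.bsky.graph.list/3kkud7l6s4v2m';
```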

View File

@ -1,23 +0,0 @@
DROP VIEW posts;
DROP VIEW lists;
DROP VIEW listitems;
ALTER TABLE "records" ALTER COLUMN "deleted" TYPE boolean USING "deleted"::boolean;
create view posts as
select *, jsonb_extract_path(content, 'langs') as langs,
parse_timestamp(jsonb_extract_path_text(content, 'createdAt')) as content_created_at
from records_post;
create view lists as
select records_list.*,
jsonb_extract_path_text(content, 'name') as name,
jsonb_extract_path_text(content, 'description') as description,
jsonb_extract_path_text(content, 'purpose') as purpose,
'at://' || repos.did || '/app.bsky.graph.list/' || rkey as uri
from records_list join repos on records_list.repo = repos.id;
create view listitems as
select *, jsonb_extract_path_text(content, 'list') as list,
jsonb_extract_path_text(content, 'subject') as subject
from records_listitem;
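
Since `langs` is exposed as jsonb, language filters against the `posts` view stay short; the jsonb `?` operator checks array membership. A sketch counting Ukrainian-language posts from the last week:

```
select count(*)
from posts
where langs ? 'uk'
  and content_created_at > now() - interval '7 days';
```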

View File

@ -1,19 +0,0 @@
# See https://docs.docker.com/compose/multiple-compose-files/merge/ for how
# exactly these overrides get applied to the main file.
# tl;dr: strings and numbers get overwritten, lists get concatenated
services:
# Expose PostgreSQL TCP port
postgres:
ports:
- "0.0.0.0:15432:5432"
# Change the default number of indexer threads
record-indexer:
environment:
INDEXER_WORKERS: 15
# Enable PDS discovery via a relay
consumer:
environment:
CONSUMER_RELAYS: "https://bsky.network"

View File

@ -1,11 +1,6 @@
services: services:
postgres: postgres:
# image: "postgres:16" image: "postgres:16"
build:
context: ./docker
dockerfile_inline: |
FROM postgres:16
RUN apt update && apt install -y postgresql-16-partman
volumes: volumes:
- "${DATA_DIR:?specify data dir in .env file}/postgres:/var/lib/postgresql/data:rw" - "${DATA_DIR:?specify data dir in .env file}/postgres:/var/lib/postgresql/data:rw"
restart: always restart: always
@ -21,37 +16,8 @@ services:
environment: environment:
POSTGRES_DB: bluesky POSTGRES_DB: bluesky
POSTGRES_PASSWORD: "${POSTGRES_PASSWORD:?specify password in .env file}" POSTGRES_PASSWORD: "${POSTGRES_PASSWORD:?specify password in .env file}"
command: [
"-c", "max_connections=1000",
"-c", "shared_buffers=8GB",
"-c", "work_mem=2GB",
"-c", "max_parallel_workers_per_gather=8",
"-c", "max_wal_size=8GB",
"-c", "shared_preload_libraries=pg_partman_bgw",
"-c", "pg_partman_bgw.interval=3600",
"-c", "pg_partman_bgw.role=postgres",
"-c", "pg_partman_bgw.dbname=bluesky",
]
shm_size: '16gb'
stop_grace_period: 24h stop_grace_period: 24h
update-db-schema:
build:
context: .
dockerfile: cmd/update-db-schema/Dockerfile
extra_hosts:
- "host.docker.internal:host-gateway"
restart: on-failure
image: uabluerail/update-db-schema
links:
- postgres:db
depends_on:
postgres:
condition: service_healthy
environment:
UPDATE-DB-SCHEMA_POSTGRES_URL: "postgres://postgres:${POSTGRES_PASSWORD}@db/bluesky?sslmode=disable"
command: [ "--log-level=0" ]
plc: plc:
build: build:
context: . context: .
@ -69,133 +35,16 @@ services:
depends_on: depends_on:
postgres: postgres:
condition: service_healthy condition: service_healthy
update-db-schema:
condition: service_completed_successfully
environment: environment:
PLC_METRICS_PORT: '8080' PLC_METRICS_PORT: '8080'
PLC_POSTGRES_URL: "postgres://postgres:${POSTGRES_PASSWORD}@db/bluesky?sslmode=disable" PLC_POSTGRES_URL: "postgres://postgres:${POSTGRES_PASSWORD}@db/bluesky?sslmode=disable"
ports: ports:
- "${METRICS_ADDR:-0.0.0.0}:11004:8080" - "0.0.0.0:11004:8080"
command: [ "--log-level=0" ] command: [ "--log-level=0" ]
healthcheck: healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8080/ready"] test: ["CMD", "curl", "-f", "http://localhost:8080/ready"]
interval: 30s interval: 30s
timeout: 5s timeout: 5s
retries: 30 retries: 30
start_period: 6h start_period: 12h
start_interval: 15s start_interval: 15s
lister:
build:
context: .
dockerfile: cmd/lister/Dockerfile
extra_hosts:
- "host.docker.internal:host-gateway"
restart: always
image: uabluerail/repo-lister
deploy:
resources:
limits:
memory: 1G
links:
- postgres:db
- plc:plc
depends_on:
postgres:
condition: service_healthy
plc:
condition: service_healthy
update-db-schema:
condition: service_completed_successfully
environment:
LISTER_METRICS_PORT: '8080'
LISTER_POSTGRES_URL: "postgres://postgres:${POSTGRES_PASSWORD}@db/bluesky?sslmode=disable"
ATP_PLC_ADDR: "http://plc:8080"
ports:
- "${METRICS_ADDR:-0.0.0.0}:11001:8080"
command: [ "--log-level=0" ]
consumer:
build:
context: .
dockerfile: cmd/consumer/Dockerfile
extra_hosts:
- "host.docker.internal:host-gateway"
restart: always
image: uabluerail/firehose-consumer
deploy:
resources:
limits:
memory: 1G
links:
- postgres:db
- plc:plc
depends_on:
postgres:
condition: service_healthy
plc:
condition: service_healthy
update-db-schema:
condition: service_completed_successfully
environment:
CONSUMER_METRICS_PORT: '8080'
CONSUMER_POSTGRES_URL: "postgres://postgres:${POSTGRES_PASSWORD}@db/bluesky?sslmode=disable"
# CONSUMER_RELAYS: "https://bsky.network" # Effectively doubles inbound network traffic. Set this in docker-compose.override.yml if needed.
ATP_PLC_ADDR: "http://plc:8080"
ports:
- "${METRICS_ADDR:-0.0.0.0}:11002:8080"
command: [ "--log-level=0" ]
record-indexer:
build:
context: .
dockerfile: cmd/record-indexer/Dockerfile
extra_hosts:
- "host.docker.internal:host-gateway"
restart: always
image: uabluerail/record-indexer
deploy:
resources:
limits:
memory: 4G
links:
- postgres:db
- plc:plc
depends_on:
postgres:
condition: service_healthy
plc:
condition: service_healthy
update-db-schema:
condition: service_completed_successfully
dns:
- 1.1.1.1
- 8.8.8.8
environment:
INDEXER_METRICS_PORT: '8080'
INDEXER_POSTGRES_URL: "postgres://postgres:${POSTGRES_PASSWORD}@db/bluesky?sslmode=disable"
INDEXER_WORKERS: 50
ATP_PLC_ADDR: "http://plc:8080"
ports:
- "${METRICS_ADDR:-0.0.0.0}:11003:8080"
command: [ "--log-level=0" ]
query-exporter:
image: adonato/query-exporter:latest
environment:
POSTGRES_URL: "postgres://postgres:${POSTGRES_PASSWORD}@postgres:5432/bluesky?sslmode=disable"
volumes:
- "./metrics/prometheus/exporters/query-exporter/config.yaml:/config.yaml"
extra_hosts:
- "host.docker.internal:host-gateway"
ports:
- "${METRICS_ADDR:-0.0.0.0}:9560:9560"
links:
- postgres:postgres
depends_on:
postgres:
condition: service_healthy
update-db-schema:
# Not a strict dependency, but it's better to not have it running
# unnecessary queries during a costly migration.
condition: service_completed_successfully

View File

@ -1 +0,0 @@
*

15
du.sql
View File

@ -1,15 +0,0 @@
SELECT
relname as table_name,
pg_size_pretty(pg_total_relation_size(relid)) As "Total Size",
pg_size_pretty(pg_indexes_size(relid)) as "Index Size",
pg_size_pretty(pg_table_size(relid)) as "Actual Size"
FROM pg_catalog.pg_statio_user_tables
ORDER BY pg_total_relation_size(relid) DESC;
SELECT
relname as table_name,
indexrelname as index_name,
pg_size_pretty(pg_table_size(indexrelid)) as "Index Size"
FROM pg_catalog.pg_statio_user_indexes
ORDER BY pg_table_size(indexrelid) DESC;

View File

@ -1,9 +1,2 @@
POSTGRES_PASSWORD='some password' POSTGRES_PASSWORD='some password'
DATA_DIR= DATA_DIR=
CSV_DIR=
# IP address to expose HTTP ports on
METRICS_ADDR=0.0.0.0
# Grafana URL with username and password. Only needed if you're going to import the dashboard.
#GRAFANA_URL="https://<username>:<password>@<hostname>"

View File

@ -1,97 +0,0 @@
# Graceful shutdown/restart
`docker compose stop lister`
`docker compose stop consumer`
`docker compose stop record-indexer`
Take a look at Grafana; once everything is quiet:
`docker compose stop postgres`
Start everything up
`docker compose up -d --build`
# Control number of workers
Full throttle
`curl 'localhost:11003/pool/resize?size=50'`
Half throttle (recommended)
`curl 'localhost:11003/pool/resize?size=25'`
Stop eating all of my Internet
`curl 'localhost:11003/pool/resize?size=10'`
# Peek into db
`docker compose exec -it postgres psql -U postgres -d bluesky`
Seen repos
`select count(*) from repos;`
Fully indexed repos
`select count(*) from repos where last_indexed_rev <> '' and (last_indexed_rev >= first_rev_since_reset or first_rev_since_reset is null or first_rev_since_reset = '');`
Get list blocks
non-partitioned (very slow)
```
select count(*) from (select distinct repo from records where collection in ('app.bsky.graph.listblock') and deleted=false and content['subject']::text like '"at://did:plc:bmjomljebcsuxolnygfgqtap/%');
```
partitioned (slow)
`select count(*) from (select distinct repo from records_listblock where deleted=false and content['subject']::text like '"at:///%');`
`select count(*) from (select distinct repo from records_listblock where deleted=false and (split_part(jsonb_extract_path_text(content, 'subject'), '/', 3))='did:plc:bmjomljebcsuxolnygfgqtap');`
Count all records
`analyze records; select relname, reltuples::int from pg_class where relname like 'records';`
View errors
`select last_error, count(*) from repos where failed_attempts > 0 group by last_error;`
Reset errors (failed repos will be retried)
`update repos set failed_attempts=0, last_error='' where failed_attempts >0;`
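A related sketch, if you want to see which PDS hosts the failures come from (same `repos`/`pds` join as in the query-exporter config):
```
select pds.host, repos.last_error, count(*) as failures
from repos join pds on repos.pds = pds.id
where repos.failed_attempts > 0
group by pds.host, repos.last_error
order by failures desc;
```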
# MONITORING
More verbose logging for queries DEBUG1-DEBUG5
`set client_min_messages = 'DEBUG5';`
Take a look at slow queries
```
SELECT pid, age(clock_timestamp(), query_start), state, query
FROM pg_stat_activity
WHERE query != '<IDLE>' AND query NOT ILIKE '%pg_stat_activity%'
ORDER BY query_start asc;
```
Monitor index progress
`select * from pg_stat_progress_create_index;`
Explore new collection types
```
select * from records where collection not in (
'app.bsky.actor.profile',
'app.bsky.feed.generator',
'app.bsky.feed.like',
'app.bsky.feed.post',
'app.bsky.feed.repost',
'app.bsky.feed.threadgate',
'app.bsky.graph.block',
'app.bsky.graph.follow',
'app.bsky.graph.listitem',
'app.bsky.graph.list',
'app.bsky.graph.listblock'
) limit 20;
```
count listitems
`select count(*) from listitems where list='at://did:plc:2yqylcqgxier4l5uplp6w6jh/app.bsky.graph.list/3kkud7l6s4v2m';`

View File

@ -1,29 +0,0 @@
{
"$type": "app.bsky.actor.profile",
"avatar": {
"ref": {
"/": "bafkreihcxxwlssxseaxa2dclcci3l6qpnhy25igjqmqhig44iddlxneymm"
},
"size": 169608,
"$type": "blob",
"mimeType": "image/jpeg"
},
"banner": {
"ref": {
"/": "bafkreiejgeq5mo4kxx5s4t3jpmxxr3kirdgv7ozkvfm4hfh3p7eaow6xyu"
},
"size": 272387,
"$type": "blob",
"mimeType": "image/jpeg"
},
"labels": {
"$type": "com.atproto.label.defs#selfLabels",
"values": [
{
"val": "!no-unauthenticated"
}
]
},
"description": "Full time parent, part time gamer. [He/Him]\n\nNeurodivergent [AuDHD] Demi 🦜\n\nSci-fi/Fantasy, video games, music, cats, anime, horror, crows, clothing, and human cognition are my jam. Not the extent of my interests - just what I'm willing to admit 😁 [NSFW]",
"displayName": "Flux 🤍🩶💜🖤"
}

View File

@ -1,45 +0,0 @@
{
"did": "did:web:skyfeed.me",
"$type": "app.bsky.feed.generator",
"createdAt": "2024-02-11T18:10:26.365Z",
"description": "絵描きさんと繋がりたい\n創作クラスタさんと繋がりたい\nクラスタフォロー\nの単語が含まれているPostのフィードです",
"displayName": "絵描きさん探し",
"skyfeedBuilder": {
"blocks": [
{
"id": "aaajsgtnqrcm6",
"did": "did:plc:l425td4tg5lq7y5gsrvfyhp5",
"type": "input",
"inputType": "firehose",
"firehoseSeconds": 604800
},
{
"id": "aaajsgtnqqgya",
"type": "remove",
"subject": "language",
"language": "ja"
},
{
"id": "aaajsgtnqqobo",
"type": "regex",
"value": "絵描きさんと繋がりたい|創作クラスタさんと繋がりたい|クラスタフォロー",
"target": "text|alt_text",
"caseSensitive": false
},
{
"id": "aaajsrd2o422c",
"type": "remove",
"value": "0",
"subject": "image_count"
},
{
"id": "aaajsgtnqsjne",
"type": "sort",
"sortType": "created_at",
"sortDirection": "desc"
}
],
"license": "EUPL-1.2",
"displayName": "絵描きさん探し"
}
}

View File

@ -1,8 +0,0 @@
{
"$type": "app.bsky.feed.like",
"subject": {
"cid": "bafyreiacuywksad5m72btsueyedsirbfamtovqdfdof2ulg2io7oofziv4",
"uri": "at://did:plc:iq5uninsn3ovpycv7rkth3ik/app.bsky.feed.post/3kjbob2uwra25"
},
"createdAt": "2024-02-06T15:23:11.641Z"
}

View File

@ -1,18 +0,0 @@
{
"text": "Чесно кажучи це один з найкращих епізодів, на ряду з безсмертним другом",
"$type": "app.bsky.feed.post",
"langs": [
"uk"
],
"reply": {
"root": {
"cid": "bafyreienbpdlpqqwcovc56lgao2botzjleuqjocitapq5f2eficz2j2hdy",
"uri": "at://did:plc:wymxmgvtvzuumvldtnz76aez/app.bsky.feed.post/3kjr2i3gsl22v"
},
"parent": {
"cid": "bafyreienbpdlpqqwcovc56lgao2botzjleuqjocitapq5f2eficz2j2hdy",
"uri": "at://did:plc:wymxmgvtvzuumvldtnz76aez/app.bsky.feed.post/3kjr2i3gsl22v"
}
},
"createdAt": "2024-01-24T23:37:39.813Z"
}

View File

@ -1,8 +0,0 @@
{
"$type": "app.bsky.feed.repost",
"subject": {
"cid": "bafyreiglj4rlihlxraqqr7wvea2zybrk3ddugwk42qwiemwrytvnchc4hy",
"uri": "at://did:plc:zvouh5woyfppe4gp6er354dl/app.bsky.feed.post/3kj3gjrmn7r2o"
},
"createdAt": "2024-01-16T12:28:45.555Z"
}

View File

@ -1,6 +0,0 @@
{
"post": "at://did:plc:gfrzrhrhzfrocrqnnutnuhk4/app.bsky.feed.post/3kgbzbuxh462c",
"$type": "app.bsky.feed.threadgate",
"allow": [],
"createdAt": "2023-12-11T18:07:47.314Z"
}

View File

@ -1,5 +0,0 @@
{
"$type": "app.bsky.graph.block",
"subject": "did:plc:yp6otbdle4znllf2wxf5vrzx",
"createdAt": "2023-11-16T17:20:56.410Z"
}

View File

@ -1,5 +0,0 @@
{
"$type": "app.bsky.graph.follow",
"subject": "did:plc:cwcgqihgua35pkw6j4iqvv7o",
"createdAt": "2023-09-19T13:31:21.477Z"
}

View File

@ -1,24 +0,0 @@
[
{
"name": "絵師",
"$type": "app.bsky.graph.list",
"purpose": "app.bsky.graph.defs#curatelist",
"createdAt": "2024-02-07T03:28:14.317Z",
"description": ""
},
{
"name": "AI Art Bros",
"$type": "app.bsky.graph.list",
"avatar": {
"ref": {
"/": "bafkreicsl3dkam2uswcmck3j7xt7nvqwuxnpwydnkqeu5ncsgf22er46iu"
},
"size": 58526,
"$type": "blob",
"mimeType": "image/jpeg"
},
"purpose": "app.bsky.graph.defs#modlist",
"createdAt": "2024-02-07T21:29:36.219Z",
"description": "A list of ai art makers. We like to put our soul in our art. Get outta here! This list will expand overtime as I catch em >:D"
}
]

View File

@ -1,5 +0,0 @@
{
"$type": "app.bsky.graph.listblock",
"subject": "at://did:plc:aobkgz6khzavtdmd5ng3ilme/app.bsky.graph.list/3k6xopgz3xc23",
"createdAt": "2024-01-05T23:19:05.067Z"
}

View File

@ -1,6 +0,0 @@
{
"list": "at://did:plc:nqouwdgddza2z3vwlxs73t4x/app.bsky.graph.list/3kksd6ja5622n",
"$type": "app.bsky.graph.listitem",
"subject": "did:plc:rd5rkwppbfxlfegsyddk24oz",
"createdAt": "2024-02-07T03:28:23.842Z"
}

View File

@ -1,22 +0,0 @@
.PHONY: all build up update down
# ---------------------------- Docker ----------------------------
all:
go test -v ./...
.env:
@cp example.env .env
@echo "Please edit .env to suit your environment before proceeding"
@exit 1
build: .env
@docker compose build
up: .env
@docker compose up -d --build
update: up
down:
@docker compose down

View File

@ -1,17 +0,0 @@
# To start prometheus + grafana
`cd metrics`
`docker compose up -d --build`
### Note: remember to allow these ports through the firewall so that Prometheus can reach host.docker.internal:xxxx from within its container
Lister, consumer, indexer
`sudo ufw allow 11001`
`sudo ufw allow 11002`
`sudo ufw allow 11003`
Postgres
`sudo ufw allow 15432`
# Go to `metrics/prometheus/exporters` and install node and query exporters

View File

@ -1,25 +0,0 @@
version: '3.8'
services:
prometheus:
image: prom/prometheus
# needed if mounted in custom volume
user: root
volumes:
- "./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml"
- "${PROMETHEUS_DATA_DIR:?specify data dir in .env file}:/prometheus"
restart: always
extra_hosts:
- "host.docker.internal:host-gateway"
ports:
- 9090:9090
grafana:
build:
context: ./grafana
user: root
restart: always
extra_hosts:
- "host.docker.internal:host-gateway"
ports:
- 9000:3000
volumes:
- ${GRAFANA_DATA_DIR:?specify data dir in .env file}:/var/lib/grafana

View File

@ -1,17 +0,0 @@
FROM grafana/grafana:latest
# Disable Login form or not
ENV GF_AUTH_DISABLE_LOGIN_FORM "true"
# Allow anonymous authentication or not
ENV GF_AUTH_ANONYMOUS_ENABLED "true"
# Role of anonymous user
ENV GF_AUTH_ANONYMOUS_ORG_ROLE "Admin"
# Install plugins here or in your own config file
# ENV GF_INSTALL_PLUGINS="<list of plugins separated by ,>"
# Add provisioning
ADD ./provisioning /etc/grafana/provisioning
# Add configuration file
ADD ./grafana.ini /etc/grafana/grafana.ini
# Add dashboard json files
ADD ./dashboards /etc/grafana/dashboards

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,23 +0,0 @@
[paths]
provisioning = /etc/grafana/provisioning
[server]
enable_gzip = true
# To add HTTPS support:
#protocol = https
#;http_addr =
#http_port = 3000
#domain = localhost
#enforce_domain = false
#root_url = https://localhost:3000
#router_logging = false
#static_root_path = public
#cert_file = /etc/certs/cert.pem
#cert_key = /etc/certs/cert-key.pem
[security]
# If you want to embed grafana into an iframe for example
allow_embedding = true
[users]
default_theme = dark

View File

@ -1,14 +0,0 @@
apiVersion: 1
contactPoints:
- orgId: 1
name: Uabluerail Discord
receivers:
- uid: edg8hyxtl9s74f
type: discord
settings:
avatar_url: https://cdn.bsky.app/img/avatar/plain/did:plc:ohvstchboonnmbplvwkl33ko/bafkreibyw6gw5ix6p7uerwurrmimrc3nfxwdba3ainto36kjv3ywhdkjdq@jpeg
# message: message template
# title: title template
url: https://discord.com/api/webhooks/1203054943578226709/lt1thL_pKzfG9fgA7reslqV1iaq9L2uYFxRIBJzxot8GAF1NicvWYHEOeMGKeQQOeOB9
use_discord_username: false
disableResolveMessage: false

View File

@ -1,25 +0,0 @@
# config file version
apiVersion: 1
providers:
# <string> a unique provider name
- name: My Dashboard
# <int> org id. will default to orgId 1 if not specified
org_id: 1
# <string, required> name of the dashboard folder. Required
folder: ''
# <string, required> provider type. Required
type: 'file'
# <bool> disable dashboard deletion
disableDeletion: false
# <bool> enable dashboard editing
editable: true
# <int> how often Grafana will scan for changed dashboards
updateIntervalSeconds: 5
# <bool> allow updating provisioned dashboards from the UI
allowUiUpdates: true
options:
# <string, required> path to dashboard files on disk. Required
path: /etc/grafana/dashboards
# <bool> use folder names from filesystem to create folders in Grafana
foldersFromFilesStructure: true

View File

@ -1,25 +0,0 @@
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
# Access mode - proxy (server in the UI) or direct (browser in the UI).
url: http://prometheus:9090
jsonData:
httpMethod: POST
manageAlerts: true
prometheusType: Prometheus
prometheusVersion: 2.49.0
cacheLevel: 'High'
disableRecordingRules: false
incrementalQueryOverlapWindow: 10m
exemplarTraceIdDestinations:
# Field with internal link pointing to data source in Grafana.
# datasourceUid value can be anything, but it should be unique across all defined data source uids.
- datasourceUid: 000000001
name: traceID
# Field with external link.
- name: traceID
url: 'http://host.docker.internal:3000/explore?orgId=1&left=%5B%22now-1h%22,%22now%22,%22Jaeger%22,%7B%22query%22:%22$${__value.raw}%22%7D%5D'

View File

@ -1,136 +0,0 @@
# Install Node-exporter
You'll need to install node exporter for monitoring
1. Download Node Exporter
As a first step, download the Node Exporter binary, which is available for Linux on the official Prometheus website. There you will find a table with the list of available builds; the one we need is the node_exporter build for Linux AMD64.
At the time of writing, the latest available version is 1.7.0. Copy the .tar.gz URL and download it somewhere on your server using wget or cURL:
`wget https://github.com/prometheus/node_exporter/releases/download/v1.7.0/node_exporter-1.7.0.linux-amd64.tar.gz`
2. Extract Node Exporter and move binary
After downloading the latest version of Node Exporter, proceed to extract the content of the downloaded tar using the following command:
`tar xvf node_exporter-1.7.0.linux-amd64.tar.gz`
The archive will be extracted into the current directory; the extracted directory contains 3 files:
LICENSE (license text file)
node_exporter (binary)
NOTICE (license text file)
You only need to move the binary file node_exporter to the /usr/local/bin directory of your system. Switch to the node_exporter directory:
`cd node_exporter-1.7.0.linux-amd64`
And then copy the binary file with the following command:
`sudo cp node_exporter /usr/local/bin`
Then you can remove the directory that was created when extracting the archive:
# Exit current directory
`cd ..`
# Remove the extracted directory
`rm -rf ./node_exporter-1.7.0.linux-amd64`
3. Create Node Exporter User
As a good practice, create a user in the system for Node Exporter:
`sudo useradd --no-create-home --shell /bin/false node_exporter`
And set the owner of the binary node_exporter to the recently created user:
`sudo chown node_exporter:node_exporter /usr/local/bin/node_exporter`
4. Create and start the Node Exporter service
The Node Exporter service should start when the server boots, so it is always available to be scraped for metrics. Create the node_exporter.service file with nano:
`sudo nano /etc/systemd/system/node_exporter.service`
And paste the following content in the file:
```
[Unit]
Description=Node Exporter
Wants=network-online.target
After=network-online.target
[Service]
User=node_exporter
Group=node_exporter
Type=simple
ExecStart=/usr/local/bin/node_exporter --collector.textfile.directory=/var/lib/prometheus/node-exporter/
Restart=always
RestartSec=3
[Install]
WantedBy=multi-user.target
```
Save the changes, close nano, and reload the systemd daemon:
`sudo systemctl daemon-reload`
And finally enable the node_exporter service with the following command:
`sudo systemctl enable node_exporter`
And then start the service:
`sudo systemctl start node_exporter`
`sudo ufw allow 9090`
`sudo ufw allow 9100`
Now go to `http://localhost:9100/metrics` to verify that metrics are being exposed.
# Install query exporter
Query-exporter is started as part of the indexer's docker compose setup.
To allow viewing it on the local network:
`cd exporters`
`sudo ufw allow 9560`
# Install smartmon
`sudo apt install prometheus-node-exporter-collectors smartmontools`
Check if your SSD is compatible (your device name may differ, mine is /dev/sda)
`sudo smartctl -i /dev/sda`
Enable SMART on your SSD
`sudo smartctl -s on /dev/sda`
Check smartmon is configured correctly
`sudo nano /lib/systemd/system/prometheus-node-exporter-smartmon.service`
It should be like this:
```
[Unit]
Description=Collect SMART metrics for prometheus-node-exporter
[Service]
Type=oneshot
Environment=TMPDIR=/var/lib/prometheus/node-exporter
ExecStart=/bin/bash -c "/usr/share/prometheus-node-exporter-collectors/smartmon.sh | sponge /var/lib/prometheus/node-exporter/smartmon.prom"
```
Start the service
`systemctl start prometheus-node-exporter-smartmon.service`
Open the node_exporter service file
`sudo nano /etc/systemd/system/node_exporter.service`
Check it contains the `--collector.textfile.directory` parameter
```
ExecStart=/usr/local/bin/node_exporter --collector.textfile.directory=/var/lib/prometheus/node-exporter/
```
Start the smartmon service
`systemctl start prometheus-node-exporter-smartmon.service`
Check the file contains correct metrics
`nano /var/lib/prometheus/node-exporter/smartmon.prom`
Restart the node exporter
`sudo systemctl restart node_exporter`
Metrics should display on `http://localhost:9100/metrics`

View File

@ -1,71 +0,0 @@
databases:
db1:
dsn: env:POSTGRES_URL
metrics:
repos_fully_indexed:
type: gauge
description: Repositories fully indexed
repos_seen:
type: gauge
description: Repositories seen
repos_failed:
type: gauge
description: Repositories that we failed to index
consumer_bad_records:
type: gauge
description: Records received from firehose that we failed to process
labels: [pds, error]
# posts_lang:
# type: summary
# description: Posts by language
# labels: [uk, lt, et, lv, pl, ga, fi, sv,
# en, jp, de, fr, pt, es, nl, ko, tr, zh, ru]
queries:
query1:
interval: 30
databases: [db1]
metrics: [repos_fully_indexed]
sql: >
select count(*) as repos_fully_indexed
from repos left join pds on repos.pds = pds.id
where failed_attempts < 3
and last_indexed_rev <> ''
and (last_indexed_rev >= first_rev_since_reset
or first_rev_since_reset is null or first_rev_since_reset = '')
and (repos.first_cursor_since_reset >= pds.first_cursor_since_reset
or repos.first_cursor_since_reset is null or repos.first_cursor_since_reset = 0);
query2:
interval: 30
databases: [db1]
metrics: [repos_seen]
sql: select count(*) as repos_seen from repos;
query3:
interval: 30
databases: [db1]
metrics: [repos_failed]
sql: select count(*) as repos_failed from repos where failed_attempts >= 3;
# query4:
# interval: 300
# databases: [db1]
# metrics: [posts_lang]
# sql: select count(*) as uk from records where collection in ('app.bsky.feed.post') and content::text like '%"langs": ["uk"]%';
bad_records:
interval: 30
databases: [db1]
metrics: [consumer_bad_records]
sql: |
select count(*) as consumer_bad_records, host as pds, error
from (
select id, created_at, pds, cursor, content,
regexp_replace(regexp_replace(regexp_replace(error,
'did:[\:a-z0-9]+', 'did:xxx', 'g'),
'json\.RawMessage\{[^}]+\}', 'json.RawMessage{...}', 'g'),
'[0-9]{1,3}(\.[0-9]{1,3}){3}\:[0-9]+', '<IP>\:<port>', 'g') as error
from bad_records
)
join
pds
on pds=pds.id
group by error, host;

View File

@ -1 +0,0 @@
POSTGRES_PASSWORD='your password'

View File

@ -1,16 +0,0 @@
global:
scrape_interval: 10s
scrape_configs:
- job_name: prometheus
static_configs:
- targets:
- host.docker.internal:9090
- job_name: node
static_configs:
- targets: ['host.docker.internal:9100']
- job_name: indexer
static_configs:
- targets: [ host.docker.internal:11001, host.docker.internal:11002, host.docker.internal:11003 ]
- job_name: db
static_configs:
- targets: ['host.docker.internal:9560']

View File

@ -1,56 +0,0 @@
package pds
import (
"context"
"fmt"
"path/filepath"
"time"
"gorm.io/gorm"
"github.com/uabluerail/indexer/models"
)
const Unknown models.ID = 0
var whitelist []string = []string{
"https://bsky.social",
"https://*.bsky.network",
"https://*",
}
type PDS struct {
ID models.ID `gorm:"primarykey"`
CreatedAt time.Time
UpdatedAt time.Time
Host string `gorm:"uniqueIndex"`
Cursor int64
FirstCursorSinceReset int64
LastList time.Time
CrawlLimit int
Disabled bool
}
func AutoMigrate(db *gorm.DB) error {
return db.AutoMigrate(&PDS{})
}
func EnsureExists(ctx context.Context, db *gorm.DB, host string) (*PDS, error) {
if !IsWhitelisted(host) {
return nil, fmt.Errorf("host %q is not whitelisted", host)
}
remote := PDS{Host: host}
if err := db.Model(&remote).Where(&PDS{Host: host}).FirstOrCreate(&remote).Error; err != nil {
return nil, fmt.Errorf("failed to get PDS record from DB for %q: %w", remote.Host, err)
}
return &remote, nil
}
func IsWhitelisted(host string) bool {
for _, p := range whitelist {
if match, _ := filepath.Match(p, host); match {
return true
}
}
return false
}

View File

@ -1,352 +0,0 @@
package repo
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"time"
"github.com/rs/zerolog"
"github.com/ipfs/go-cid"
"github.com/ipld/go-car"
"github.com/ipld/go-ipld-prime/codec/dagcbor"
"github.com/ipld/go-ipld-prime/codec/dagjson"
"github.com/ipld/go-ipld-prime/datamodel"
"github.com/ipld/go-ipld-prime/node/basicnode"
)
var ErrInvalidSignature = fmt.Errorf("commit signature is not valid")
func ExtractRecords(ctx context.Context, b io.Reader, signingKey string) (map[string]json.RawMessage, error) {
log := zerolog.Ctx(ctx)
r, err := car.NewCarReader(b)
if err != nil {
return nil, fmt.Errorf("failed to construct CAR reader: %w", err)
}
blocks := map[cid.Cid][]byte{}
for {
block, err := r.Next()
if errors.Is(err, io.EOF) {
break
}
if err != nil {
return nil, fmt.Errorf("reading next block: %w", err)
}
c, err := block.Cid().Prefix().Sum(block.RawData())
if err != nil {
return nil, fmt.Errorf("failed to calculate CID from content")
}
if c.Equals(block.Cid()) {
blocks[block.Cid()] = block.RawData()
} else {
log.Debug().Str("cid", block.Cid().String()).
Msgf("CID doesn't match block content: %s != %s", block.Cid().String(), c.String())
}
}
records := map[string]cid.Cid{}
if len(r.Header.Roots) == 0 {
return nil, fmt.Errorf("CAR has zero roots specified")
}
// https://atproto.com/specs/repository specifies that the first root
// must be a commit object. Meaning of subsequent roots is not yet defined.
root := r.Header.Roots[0]
// TODO: verify that a root is a commit record and validate signature
if _, found := blocks[root]; !found {
return nil, fmt.Errorf("root block is missing")
}
valid, err := verifyCommitSignature(ctx, blocks[root], signingKey)
if err != nil {
return nil, fmt.Errorf("commit signature verification failed: %w", err)
}
if !valid {
return nil, ErrInvalidSignature
}
cids, err := findRecords(blocks, root, nil, nil, 0)
if err != nil {
return nil, err
}
for k, v := range cids {
records[k] = v
}
res := map[string]json.RawMessage{}
for k, c := range records {
builder := basicnode.Prototype.Any.NewBuilder()
if err := (&dagcbor.DecodeOptions{AllowLinks: true}).Decode(builder, bytes.NewReader(blocks[c])); err != nil {
return nil, fmt.Errorf("unmarshaling %q: %w", c.String(), err)
}
w := bytes.NewBuffer(nil)
if err := (dagjson.EncodeOptions{EncodeLinks: true, EncodeBytes: true}).Encode(builder.Build(), w); err != nil {
return nil, fmt.Errorf("marshaling %q as JSON: %w", c.String(), err)
}
res[k] = w.Bytes()
}
return res, nil
}
const maxDepth = 128
func findRecords(blocks map[cid.Cid][]byte, root cid.Cid, key []byte, visited map[cid.Cid]bool, depth int) (map[string]cid.Cid, error) {
if depth > maxDepth {
return nil, fmt.Errorf("reached maximum depth at %q", root.String())
}
if visited == nil {
visited = map[cid.Cid]bool{}
}
visited[root] = true
builder := basicnode.Prototype.Any.NewBuilder()
if err := (&dagcbor.DecodeOptions{AllowLinks: true}).Decode(builder, bytes.NewReader(blocks[root])); err != nil {
return nil, fmt.Errorf("unmarshaling %q: %w", root.String(), err)
}
node := builder.Build()
if node.Kind() != datamodel.Kind_Map {
return nil, nil
}
m, err := parseMap(node)
if err != nil {
return nil, err
}
if _, ok := m["$type"]; ok {
return map[string]cid.Cid{string(key): root}, nil
}
if d, ok := m["data"]; ok {
// Commit record
if d.Kind() == datamodel.Kind_Link {
l, _ := d.AsLink()
if l != nil {
c, err := cid.Parse([]byte(l.Binary()))
if err != nil {
return nil, fmt.Errorf("failed to parse %q as CID: %w", l.String(), err)
}
if _, ok := blocks[c]; ok && !visited[c] {
return findRecords(blocks, c, nil, visited, depth+1)
}
}
}
return nil, nil
}
if entries, ok := m["e"]; ok {
// MST node
r := map[string]cid.Cid{}
iter := entries.ListIterator()
key = []byte{}
for !iter.Done() {
_, item, err := iter.Next()
if err != nil {
return nil, fmt.Errorf("failed to read the next list item in block %q: %w", root.String(), err)
}
if item.Kind() != datamodel.Kind_Map {
continue
}
m, err := parseMap(item)
if err != nil {
return nil, err
}
for _, field := range []string{"k", "p", "v", "t"} {
if _, ok := m[field]; !ok {
return nil, fmt.Errorf("TreeEntry is missing field %q", field)
}
}
prefixLen, err := m["p"].AsInt()
if err != nil {
return nil, fmt.Errorf("m[\"p\"].AsInt(): %w", err)
}
prefixPart, err := m["k"].AsBytes()
if err != nil {
return nil, fmt.Errorf("m[\"k\"].AsBytes(): %w", err)
}
val, err := m["v"].AsLink()
if err != nil {
return nil, fmt.Errorf("m[\"v\"].AsLink(): %w", err)
}
c, err := cid.Parse([]byte(val.Binary()))
if err != nil {
return nil, fmt.Errorf("failed to parse %q as CID: %w", val.String(), err)
}
if len(key) == 0 {
// First entry, must have a full key.
if prefixLen != 0 {
return nil, fmt.Errorf("incomplete key in the first entry")
}
key = prefixPart
}
if prefixLen > int64(len(key)) {
return nil, fmt.Errorf("specified prefix length is larger than the key length: %d > %d", prefixLen, len(key))
}
key = append(key[:prefixLen], prefixPart...)
if _, ok := blocks[c]; ok && !visited[c] {
results, err := findRecords(blocks, c, key, visited, depth+1)
if err != nil {
return nil, err
}
for k, v := range results {
r[k] = v
}
}
if m["t"] != nil && m["t"].Kind() == datamodel.Kind_Link {
subtree, err := m["t"].AsLink()
if err != nil {
return nil, fmt.Errorf("m[\"t\"].AsLink(): %w", err)
}
subtreeCid, err := cid.Parse([]byte(subtree.Binary()))
if err != nil {
return nil, fmt.Errorf("failed to parse %q as CID: %w", val.String(), err)
}
if _, ok := blocks[subtreeCid]; ok && !visited[subtreeCid] {
results, err := findRecords(blocks, subtreeCid, key, visited, depth+1)
if err != nil {
return nil, err
}
for k, v := range results {
r[k] = v
}
}
}
}
left, ok := m["l"]
if ok && left.Kind() == datamodel.Kind_Link {
l, _ := left.AsLink()
if l != nil {
c, err := cid.Parse([]byte(l.Binary()))
if err != nil {
return nil, fmt.Errorf("failed to parse %q as CID: %w", l.String(), err)
}
if _, ok := blocks[c]; ok && !visited[c] {
results, err := findRecords(blocks, c, nil, visited, depth+1)
if err != nil {
return nil, err
}
for k, v := range results {
r[k] = v
}
}
}
}
return r, nil
}
return nil, fmt.Errorf("unrecognized block %q", root.String())
}
func parseMap(node datamodel.Node) (map[string]datamodel.Node, error) {
if node.Kind() != datamodel.Kind_Map {
return nil, fmt.Errorf("not a map")
}
m := map[string]datamodel.Node{}
iter := node.MapIterator()
for !iter.Done() {
k, v, err := iter.Next()
if err != nil {
return nil, fmt.Errorf("iterating over map fields: %w", err)
}
if k.Kind() != datamodel.Kind_String {
continue
}
ks, _ := k.AsString()
m[ks] = v
}
return m, nil
}
var ErrZeroBlocks = fmt.Errorf("zero blocks found")
func GetRev(ctx context.Context, b io.Reader) (string, error) {
r, err := car.NewCarReader(b)
if err != nil {
return "", fmt.Errorf("failed to construct CAR reader: %w", err)
}
if len(r.Header.Roots) == 0 {
return "", fmt.Errorf("no roots specified in CAR header")
}
blocks := map[cid.Cid][]byte{}
for {
block, err := r.Next()
if errors.Is(err, io.EOF) {
break
}
if err != nil {
return "", fmt.Errorf("reading next block: %w", err)
}
c, err := block.Cid().Prefix().Sum(block.RawData())
if err != nil {
return "", fmt.Errorf("failed to calculate CID from content")
}
if c.Equals(block.Cid()) {
blocks[block.Cid()] = block.RawData()
}
}
if len(blocks) == 0 {
return "", ErrZeroBlocks
}
builder := basicnode.Prototype.Any.NewBuilder()
if err := (&dagcbor.DecodeOptions{AllowLinks: true}).Decode(builder, bytes.NewReader(blocks[r.Header.Roots[0]])); err != nil {
return "", fmt.Errorf("unmarshaling %q: %w", r.Header.Roots[0].String(), err)
}
node := builder.Build()
v, err := node.LookupByString("rev")
if err != nil {
return "", fmt.Errorf("looking up 'rev' field: %w", err)
}
s, err := v.AsString()
if err != nil {
return "", fmt.Errorf("rev.AsString(): %w", err)
}
return s, nil
}
func GetLang(ctx context.Context, value json.RawMessage) ([]string, time.Time, error) {
var content struct {
Type string `json:"$type"`
Langs []string `json:"langs"`
Time string `json:"createdAt"`
}
err := json.Unmarshal([]byte(value), &content)
if err != nil {
return nil, time.Now(), fmt.Errorf("failed to extract lang from content: %w", err)
}
if content.Type != "app.bsky.feed.post" {
return nil, time.Now(), errors.New("not a post")
}
var timestamp time.Time
if t, err := time.Parse(time.RFC3339, content.Time); err != nil {
return nil, time.Now(), fmt.Errorf("failed to extract time from content: %w", err)
} else {
timestamp = t
}
return content.Langs, timestamp, nil
}

View File

@ -1,105 +0,0 @@
package repo
import (
"context"
"encoding/json"
"errors"
"fmt"
"time"
"gorm.io/gorm"
"github.com/uabluerail/indexer/models"
"github.com/uabluerail/indexer/pds"
"github.com/uabluerail/indexer/util/resolver"
)
type Repo struct {
ID models.ID `gorm:"primarykey"`
CreatedAt time.Time
UpdatedAt time.Time
PDS models.ID `gorm:"default:0;index:rev_state_index,priority:2;index:was_indexed,priority:2"`
DID string `gorm:"uniqueIndex;column:did"`
LastIndexedRev string `gorm:"index:rev_state_index,expression:(last_indexed_rev < first_rev_since_reset),priority:1;index:was_indexed,expression:(last_indexed_rev is null OR last_indexed_rev = ''),priority:1"`
FirstRevSinceReset string
LastFirehoseRev string
FirstCursorSinceReset int64
TombstonedAt time.Time
LastIndexAttempt time.Time
LastError string
FailedAttempts int `gorm:"default:0"`
LastKnownKey string
}
type Record struct {
ID models.ID
CreatedAt time.Time `gorm:"not null"`
UpdatedAt time.Time `gorm:"autoUpdateTime:false"`
Repo models.ID `gorm:"index:idx_repo_record_key,priority:1;not null;index:idx_repo_rev"`
Collection string `gorm:"index:idx_repo_record_key,priority:2;not null"`
Rkey string `gorm:"index:idx_repo_record_key,priority:3"`
AtRev string `gorm:"index:idx_repo_rev"`
Content json.RawMessage `gorm:"type:JSONB"`
Deleted bool `gorm:"default:false"`
}
func AutoMigrate(db *gorm.DB) error {
return db.AutoMigrate(&Repo{}, &Record{})
}
func EnsureExists(ctx context.Context, db *gorm.DB, did string) (*Repo, bool, error) {
r := Repo{}
if err := db.Model(&r).Where(&Repo{DID: did}).Take(&r).Error; err == nil {
// Already have a row, just return it.
return &r, false, nil
} else {
if !errors.Is(err, gorm.ErrRecordNotFound) {
return nil, false, fmt.Errorf("querying DB: %w", err)
}
}
// No row yet, so we need to create one (keeping in mind that it can be created
// concurrently by someone else).
// 1) resolve did (i.e., query PLC)
// 2) get PDS address from didDoc and ensure we have a record for it
// 3) in a transaction, check if we have a record for the repo
// if we don't - just create a record
// if we do - compare PDS IDs
// if they don't match - also reset FirstRevSinceReset
u, pubKey, err := resolver.GetPDSEndpointAndPublicKey(ctx, did)
if err != nil {
return nil, false, fmt.Errorf("fetching DID Document: %w", err)
}
if u.Path == "/" {
// Discard insignificant path to avoid string comparison mismatches,
// as well as glob pattern false negatives.
u.Path = ""
}
remote, err := pds.EnsureExists(ctx, db, u.String())
if err != nil {
return nil, false, fmt.Errorf("failed to get PDS record from DB for %q: %w", u.String(), err)
}
r = Repo{
DID: did,
PDS: models.ID(remote.ID),
LastKnownKey: pubKey,
}
created := false
err = db.Transaction(func(tx *gorm.DB) error {
result := tx.Model(&r).Where(&Repo{DID: r.DID}).FirstOrCreate(&r)
if err := result.Error; err != nil {
return fmt.Errorf("looking for repo: %w", err)
}
if r.PDS != models.ID(remote.ID) {
return tx.Model(&r).Select("FirstRevSinceReset").Updates(&Repo{FirstRevSinceReset: ""}).Error
}
created = result.RowsAffected > 0
return nil
})
if err != nil {
return nil, false, fmt.Errorf("upserting repo record: %w", err)
}
return &r, created, nil
}

View File

@ -1,223 +0,0 @@
package repo
import (
"bytes"
"context"
"crypto"
"crypto/ecdsa"
"crypto/elliptic"
"crypto/sha256"
"encoding/binary"
"fmt"
"io"
"math/big"
"github.com/rs/zerolog"
"gitlab.com/yawning/secp256k1-voi/secec"
"github.com/ipfs/go-cid"
"github.com/ipld/go-ipld-prime/codec/dagcbor"
"github.com/ipld/go-ipld-prime/datamodel"
"github.com/ipld/go-ipld-prime/node/basicnode"
"github.com/multiformats/go-multibase"
"github.com/multiformats/go-multicodec"
)
type SignatureValidator func(digest []byte, sig []byte) (bool, error)
func parseSigningKey(ctx context.Context, key string) (SignatureValidator, error) {
log := zerolog.Ctx(ctx)
// const didKey = "did:key:"
// if !strings.HasPrefix(key, didKey) {
// return nil, fmt.Errorf("expected the key %q to have prefix %q", key, didKey)
// }
// key = strings.TrimPrefix(key, didKey)
enc, val, err := multibase.Decode(key)
if err != nil {
return nil, fmt.Errorf("failed to decode key data: %w", err)
}
if enc != multibase.Base58BTC {
log.Info().Msgf("unexpected key encoding: %v", enc)
}
buf := bytes.NewBuffer(val)
kind, err := binary.ReadUvarint(buf)
if err != nil {
return nil, fmt.Errorf("failed to parse key type: %w", err)
}
data, _ := io.ReadAll(buf)
switch multicodec.Code(kind) {
case multicodec.P256Pub:
x, y := elliptic.UnmarshalCompressed(elliptic.P256(), data)
return func(digest, sig []byte) (bool, error) {
pk := &ecdsa.PublicKey{
Curve: elliptic.P256(),
X: x,
Y: y,
}
if len(sig) != 64 {
return false, fmt.Errorf("unexpected signature length: %d != 64", len(sig))
}
r := big.NewInt(0).SetBytes(sig[:32])
s := big.NewInt(0).SetBytes(sig[32:])
return ecdsa.Verify(pk, digest, r, s), nil
}, nil
case multicodec.Secp256k1Pub:
pk, err := secec.NewPublicKey(data)
if err != nil {
return nil, fmt.Errorf("failed to parse secp256k public key: %w", err)
}
return func(digest, sig []byte) (bool, error) {
return pk.Verify(digest, sig, &secec.ECDSAOptions{
Hash: crypto.SHA256,
Encoding: secec.EncodingCompact,
RejectMalleable: true,
}), nil
}, nil
default:
return nil, fmt.Errorf("unsupported key type %q", multicodec.Code(kind))
}
}
func verifyCommitSignature(ctx context.Context, data []byte, key string) (bool, error) {
validateSignature, err := parseSigningKey(ctx, key)
if err != nil {
return false, fmt.Errorf("failed to parse the key: %w", err)
}
type Commit struct {
DID string
Version int
Data cid.Cid
Rev string
Prev *cid.Cid
Sig []byte
}
builder := basicnode.Prototype.Any.NewBuilder()
if err := (&dagcbor.DecodeOptions{AllowLinks: true}).Decode(builder, bytes.NewReader(data)); err != nil {
return false, fmt.Errorf("unmarshaling commit: %w", err)
}
node := builder.Build()
if node.Kind() != datamodel.Kind_Map {
return false, fmt.Errorf("commit must be a Map, got %s instead", node.Kind())
}
m, err := parseMap(node)
if err != nil {
return false, err
}
commit := Commit{}
if n, found := m["version"]; !found {
return false, fmt.Errorf("missing \"version\"")
} else {
v, err := n.AsInt()
if err != nil {
return false, fmt.Errorf("failed to parse \"version\": %w", err)
}
commit.Version = int(v)
}
if n, found := m["did"]; !found {
return false, fmt.Errorf("missing \"did\"")
} else {
v, err := n.AsString()
if err != nil {
return false, fmt.Errorf("failed to parse \"did\": %w", err)
}
commit.DID = v
}
if n, found := m["data"]; !found {
return false, fmt.Errorf("missing \"data\"")
} else {
v, err := n.AsLink()
if err != nil {
return false, fmt.Errorf("failed to parse \"data\": %w", err)
}
c, err := cid.Parse([]byte(v.Binary()))
if err != nil {
return false, fmt.Errorf("failed to convert \"data\" to CID: %w", err)
}
commit.Data = c
}
if n, found := m["rev"]; !found {
return false, fmt.Errorf("missing \"rev\"")
} else {
v, err := n.AsString()
if err != nil {
return false, fmt.Errorf("failed to parse \"rev\": %w", err)
}
commit.Rev = v
}
if n, found := m["prev"]; !found {
return false, fmt.Errorf("missing \"prev\"")
} else {
if !n.IsNull() {
v, err := n.AsLink()
if err != nil {
return false, fmt.Errorf("failed to parse \"prev\": %w", err)
}
c, err := cid.Parse([]byte(v.Binary()))
if err != nil {
return false, fmt.Errorf("failed to convert \"prev\" to CID: %w", err)
}
commit.Prev = &c
}
}
if n, found := m["sig"]; !found {
return false, fmt.Errorf("missing \"sig\"")
} else {
v, err := n.AsBytes()
if err != nil {
return false, fmt.Errorf("failed to parse \"sig\": %w", err)
}
commit.Sig = v
}
if commit.Version != 3 {
return false, fmt.Errorf("unknown commit version %d", commit.Version)
}
unsignedBuilder := basicnode.Prototype.Map.NewBuilder()
mb, err := unsignedBuilder.BeginMap(int64(len(m) - 1))
if err != nil {
return false, fmt.Errorf("initializing a map for unsigned commit: %w", err)
}
// XXX: signature validation depends on this specific order of keys in the map.
for _, k := range []string{"did", "rev", "data", "prev", "version"} {
if k == "sig" {
continue
}
if err := mb.AssembleKey().AssignString(k); err != nil {
return false, fmt.Errorf("failed to assemble key %q: %w", k, err)
}
if err := mb.AssembleValue().AssignNode(m[k]); err != nil {
return false, fmt.Errorf("failed to assemble value for key %q: %w", k, err)
}
}
if err := mb.Finish(); err != nil {
return false, fmt.Errorf("failed to finalize the map: %w", err)
}
unsignedNode := unsignedBuilder.Build()
buf := bytes.NewBuffer(nil)
if err := (&dagcbor.EncodeOptions{AllowLinks: true}).Encode(unsignedNode, buf); err != nil {
return false, fmt.Errorf("failed to serialize unsigned commit: %w", err)
}
unsignedBytes := buf.Bytes()
unsignedHash := sha256.Sum256(unsignedBytes)
return validateSignature(unsignedHash[:], commit.Sig)
}

View File

@ -1,4 +0,0 @@
SELECT pid, age(clock_timestamp(), query_start), state, query
FROM pg_stat_activity
WHERE query != '<IDLE>' AND query NOT ILIKE '%pg_stat_activity%' AND state <> 'idle'
ORDER BY query_start asc;