Add AtRev column to only overwrite records with a newer version

main
Max Ignatenko 2024-02-17 14:29:45 +00:00
parent 1d3c6edf0a
commit 1038ca3bea
3 changed files with 50 additions and 7 deletions

View File

@ -10,6 +10,7 @@ import (
"net/http"
"net/url"
"path"
"regexp"
"strings"
"time"
@ -177,6 +178,12 @@ func (c *Consumer) updateCursor(ctx context.Context, seq int64) error {
}
// postgresFixRegexp matches, at each backslash in a JSON document, either an
// escaped backslash (`\\`) or an escaped NUL character (`\u0000`). The escaped
// backslash alternative is listed first so that backslashes are consumed in
// pairs; this keeps the text `\\u0000` (an escaped backslash followed by the
// literal characters "u0000") from being mistaken for a NUL escape.
var postgresFixRegexp = regexp.MustCompile(`\\\\|\\u0000`)

// escapeNullCharForPostgres returns a copy of b in which every JSON `\u0000`
// escape sequence is replaced with the literal text `<0x00>`. All other bytes,
// including the characters around the escape and any escaped backslashes, are
// preserved unchanged.
//
// The input is assumed to be valid JSON, where a backslash only ever starts an
// escape sequence inside a string value. The input slice is not modified.
func escapeNullCharForPostgres(b []byte) []byte {
	return postgresFixRegexp.ReplaceAllFunc(b, func(m []byte) []byte {
		if len(m) == 2 {
			// Two-byte match is an escaped backslash: keep it verbatim. It is
			// matched only so the scanner skips over it as a unit.
			return m
		}
		return []byte(`<0x00>`)
	})
}
func (c *Consumer) processMessage(ctx context.Context, typ string, r io.Reader, first bool) error {
log := zerolog.Ctx(ctx)
@ -253,7 +260,11 @@ func (c *Consumer) processMessage(ctx context.Context, typ string, r io.Reader,
Repo: models.ID(repoInfo.ID),
Collection: parts[0],
Rkey: parts[1],
Content: v,
// XXX: proper replacement of \u0000 would require full parsing of JSON
// and recursive iteration over all string values, but this
// should work well enough for now.
Content: escapeNullCharForPostgres(v),
AtRev: payload.Rev,
})
}
if len(recs) == 0 && expectRecords {
@ -261,8 +272,16 @@ func (c *Consumer) processMessage(ctx context.Context, typ string, r io.Reader,
}
if len(recs) > 0 || expectRecords {
err = c.db.Model(&repo.Record{}).
Clauses(clause.OnConflict{DoUpdates: clause.AssignmentColumns([]string{"content"}),
Columns: []clause.Column{{Name: "repo"}, {Name: "collection"}, {Name: "rkey"}}}).
Clauses(clause.OnConflict{
Where: clause.Where{Exprs: []clause.Expression{clause.Or(
clause.Eq{Column: clause.Column{Name: "at_rev", Table: "records"}, Value: nil},
clause.Eq{Column: clause.Column{Name: "at_rev", Table: "records"}, Value: ""},
clause.Lt{
Column: clause.Column{Name: "at_rev", Table: "records"},
Value: clause.Column{Name: "at_rev", Table: "excluded"}},
)}},
DoUpdates: clause.AssignmentColumns([]string{"content", "at_rev"}),
Columns: []clause.Column{{Name: "repo"}, {Name: "collection"}, {Name: "rkey"}}}).
Create(recs).Error
if err != nil {
return fmt.Errorf("inserting records into the database: %w", err)

View File

@ -5,6 +5,7 @@ import (
"context"
"fmt"
"net/url"
"regexp"
"strings"
"time"
@ -131,6 +132,12 @@ func (p *WorkerPool) worker(ctx context.Context, signal chan struct{}) {
}
}
// postgresFixRegexp matches, at each backslash in a JSON document, either an
// escaped backslash (`\\`) or an escaped NUL character (`\u0000`). The escaped
// backslash alternative comes first so backslashes are always consumed in
// pairs, which prevents `\\u0000` (escaped backslash + literal "u0000") from
// being treated as a NUL escape.
var postgresFixRegexp = regexp.MustCompile(`\\\\|\\u0000`)

// escapeNullCharForPostgres returns a copy of b with every JSON `\u0000`
// escape sequence replaced by the literal text `<0x00>`. Everything else,
// including the bytes adjacent to the escape and any escaped backslashes, is
// left exactly as it was.
//
// The input is assumed to be valid JSON, so a backslash can only begin an
// escape sequence inside a string value. The input slice is not modified.
func escapeNullCharForPostgres(b []byte) []byte {
	return postgresFixRegexp.ReplaceAllFunc(b, func(m []byte) []byte {
		if len(m) == 2 {
			// Escaped backslash: matched only so that it is skipped as a
			// unit; emit it unchanged.
			return m
		}
		return []byte(`<0x00>`)
	})
}
func (p *WorkerPool) doWork(ctx context.Context, work WorkItem) error {
log := zerolog.Ctx(ctx)
defer close(work.signal)
@ -201,19 +208,32 @@ retry:
log.Warn().Msgf("Unexpected key format: %q", k)
continue
}
v = regexp.MustCompile(`[^\\](\\\\)*(\\u0000)`).ReplaceAll(v, []byte(`$1<0x00>`))
recs = append(recs, repo.Record{
Repo: models.ID(work.Repo.ID),
Collection: parts[0],
Rkey: parts[1],
Content: v,
// XXX: proper replacement of \u0000 would require full parsing of JSON
// and recursive iteration over all string values, but this
// should work well enough for now.
Content: escapeNullCharForPostgres(v),
AtRev: newRev,
})
}
recordsFetched.Add(float64(len(recs)))
if len(recs) > 0 {
for _, batch := range splitInBatshes(recs, 500) {
result := p.db.Model(&repo.Record{}).
Clauses(clause.OnConflict{DoUpdates: clause.AssignmentColumns([]string{"content"}),
Columns: []clause.Column{{Name: "repo"}, {Name: "collection"}, {Name: "rkey"}}}).
Clauses(clause.OnConflict{
Where: clause.Where{Exprs: []clause.Expression{clause.Or(
clause.Eq{Column: clause.Column{Name: "at_rev", Table: "records"}, Value: nil},
clause.Eq{Column: clause.Column{Name: "at_rev", Table: "records"}, Value: ""},
clause.Lt{
Column: clause.Column{Name: "at_rev", Table: "records"},
Value: clause.Column{Name: "at_rev", Table: "excluded"}},
)}},
DoUpdates: clause.AssignmentColumns([]string{"content", "at_rev"}),
Columns: []clause.Column{{Name: "repo"}, {Name: "collection"}, {Name: "rkey"}}}).
Create(batch)
if err := result.Error; err != nil {
return fmt.Errorf("inserting records into the database: %w", err)
@ -229,6 +249,9 @@ retry:
return fmt.Errorf("updating repo rev: %w", err)
}
// TODO: check for records that are missing in the repo download
// and mark them as deleted.
return nil
}

View File

@ -34,9 +34,10 @@ type Record struct {
ID models.ID `gorm:"primarykey"`
CreatedAt time.Time
UpdatedAt time.Time
Repo models.ID `gorm:"index:idx_repo_record_key,unique,priority:1;not null"`
Repo models.ID `gorm:"index:idx_repo_record_key,unique,priority:1;not null;index:idx_repo_rev"`
Collection string `gorm:"index:idx_repo_record_key,unique,priority:2;not null"`
Rkey string `gorm:"index:idx_repo_record_key,unique,priority:3"`
AtRev string `gorm:"index:idx_repo_rev"`
Content json.RawMessage `gorm:"type:JSONB"`
Deleted bool
}