blob: ccecbac8157c5af8c0446224f0f9320ebd4cf6f0 [file] [log] [blame]
package resultstore
import (
"bytes"
"encoding/json"
"fmt"
"path"
"sort"
"strings"
"time"
"github.com/boltdb/bolt"
"go.skia.org/infra/ct/go/util"
"go.skia.org/infra/ct_pixel_diff/go/dynamicdiff"
"go.skia.org/infra/go/fileutil"
)
const (
// Sort parameter constants. The first six are the sortField values that
// SortRun recognizes; DSC is the sortOrder value that requests a descending
// sort (any other sortOrder value results in an ascending sort).
NUM_DIFF = "numDiff"
PERCENT_DIFF = "percentDiff"
RED_DIFF = "redDiff"
GREEN_DIFF = "greenDiff"
BLUE_DIFF = "blueDiff"
RANK = "rank"
DSC = "descending"
// URL search constants. HTTP, HTTPS and WWW are the prefixes GetURLs strips
// from a URL; TEXT and VALUE are the keys of the maps GetURLs returns.
HTTP = "http://"
HTTPS = "https://"
WWW = "www."
TEXT = "text"
VALUE = "value"
// Constants used to report statistics. BUCKET_0 through BUCKET_9 are the keys
// of the histogram returned by GetStats, one per range of pixel diff
// percentage; only the last bucket includes its upper bound.
BUCKET_0 = "[0-10)"
BUCKET_1 = "[10-20)"
BUCKET_2 = "[20-30)"
BUCKET_3 = "[30-40)"
BUCKET_4 = "[40-50)"
BUCKET_5 = "[50-60)"
BUCKET_6 = "[60-70)"
BUCKET_7 = "[70-80)"
BUCKET_8 = "[80-90)"
BUCKET_9 = "[90-100]"
// Keys of the statistics map returned by GetStats.
NUM_TOTAL_RESULTS = "numTotalResults"
NUM_DYNAMIC_CONTENT = "numDynamicContent"
NUM_ZERO_DIFF = "numZeroDiff"
// Number of ResultRec instances to render in one render request. GetFiltered
// returns at most this many results per call.
CHUNK_SIZE = 20
)
var (
// BeginningOfTime is used as the start time in order to return all runs in
// GetRunIDs; fillCache passes it together with time.Now(). GetRunIDs compares
// with a strict Before, so a run stamped at or before this instant
// (2015-01-02 15:04:05 UTC) is not returned.
BeginningOfTime = time.Date(2015, time.January, 02, 15, 04, 05, 0, time.UTC)
)
// ResultRec defines the struct stored in ResultStore that can be queried over
// the web. BoltResultStore serializes it as JSON, keyed by URL inside a bucket
// named after the run ID.
type ResultRec struct {
// RunID is the unique ID of the CT pixel diff run, in the form
// userID-timestamp.
RunID string
// URL identifies the web page that was screenshotted.
URL string
// Rank is the popularity rank of the web page.
Rank int
// NoPatchImg is the imageID of the screenshot taken without the patch.
NoPatchImg string
// WithPatchImg is the imageID of the screenshot taken with the patch.
WithPatchImg string
// DiffMetrics are the results of diffing NoPatchImg and WithPatchImg. It is
// nil until the diff has been computed; only records with a non-nil
// DiffMetrics are kept in the BoltResultStore cache.
DiffMetrics *dynamicdiff.DynamicDiffMetrics
}
// HasBothImages reports whether the ResultRec carries an image ID for both the
// no-patch and the with-patch screenshot, i.e. neither NoPatchImg nor
// WithPatchImg is the empty string.
func (r *ResultRec) HasBothImages() bool {
	if r.NoPatchImg == "" {
		return false
	}
	return r.WithPatchImg != ""
}
// ResultStore is an interface for storing results extracted from Cluster
// Telemetry Pixel Diff JSON metadata. Get, GetAll, GetRunIDs, Put and
// RemoveRun address the stored records directly; GetFiltered, SortRun,
// GetURLs and GetStats operate on the cached results of a single run.
type ResultStore interface {
// Get returns a ResultRec from the ResultStore using the runID and url.
Get(runID, url string) (*ResultRec, error)
// GetAll returns all the ResultRecs associated with the runID.
GetAll(runID string) ([]*ResultRec, error)
// GetRunIDs returns all the runIDs in the database that fall in between the
// start and end times.
GetRunIDs(start time.Time, end time.Time) ([]string, error)
// Put adds a ResultRec to the ResultStore using the runID and url.
Put(runID, url string, rec *ResultRec) error
// RemoveRun removes all the data associated with the runID from the
// ResultStore.
RemoveRun(runID string) error
// GetFiltered returns cached results within the given bounds for the given
// runID, starting at startIdx. The returned int is the index at which the
// next call should resume.
GetFiltered(runID string, startIdx int, min float32, max float32) ([]*ResultRec, int, error)
// SortRun sorts the cached results for the given runID using the sort
// parameters.
SortRun(runID, sortField, sortOrder string) error
// GetURLs returns the URLs of all cached results for the given runID.
GetURLs(runID string) ([]map[string]string, error)
// GetStats returns various statistics and a histogram using cached results
// for the given runID. The first map holds the statistics, the second the
// histogram.
GetStats(runID string) (map[string]int, map[string]int, error)
}
// BoltResultStore implements the ResultStore interface with a boltDB instance.
// Each run is stored in its own bucket named after the run ID, with records
// keyed by URL.
// NOTE(review): no synchronization around cache is visible in this file;
// confirm that callers serialize access to the store.
type BoltResultStore struct {
// Handle to the underlying boltDB database.
db *bolt.DB
// Map of runIDs to list of ResultRecs. The cache only contains entries that
// have computed diff metrics.
cache map[string][]*ResultRec
}
// NewBoltResultStore returns a new instance of BoltResultStore, using the given
// boltDir and boltName to create the boltDB instance. The in-memory cache is
// populated from the database before the store is returned.
//
// An error is returned if the directory cannot be ensured, the database cannot
// be opened, or the cache cannot be filled. In the last case the database is
// closed again so that the handle is not leaked.
func NewBoltResultStore(boltDir, boltName string) (ResultStore, error) {
	// Make sure directory for boltDB instance exists.
	boltDir, err := fileutil.EnsureDirExists(boltDir)
	if err != nil {
		return nil, err
	}

	// Create the boltDB instance.
	db, err := bolt.Open(path.Join(boltDir, boltName), 0600, nil)
	if err != nil {
		return nil, err
	}

	store := &BoltResultStore{
		db:    db,
		cache: map[string][]*ResultRec{},
	}

	// Fill the cache from the data already in the database.
	if err := store.fillCache(); err != nil {
		// The store is unusable and is not handed to the caller, so nobody else
		// could ever close the database. The fill error is the one worth
		// reporting; a failure to close is deliberately ignored.
		_ = db.Close()
		return nil, err
	}
	return store, nil
}
// fillCache rebuilds the in-memory cache from the contents of the boltDB
// instance. The cache does not survive a server crash or reboot while the
// boltDB data does, so this brings the two back in sync at startup. Every run
// between BeginningOfTime and now is loaded; for each one the cache receives
// the list returned by GetAll (which may be empty). The first error
// encountered is returned and stops the fill.
func (b *BoltResultStore) fillCache() error {
	ids, err := b.GetRunIDs(BeginningOfTime, time.Now())
	if err != nil {
		return err
	}
	for idx := range ids {
		recs, err := b.GetAll(ids[idx])
		if err != nil {
			return err
		}
		b.cache[ids[idx]] = recs
	}
	return nil
}
// Get looks up the record stored under url inside the bucket named runID and
// returns it decoded. A nil record with a nil error is returned when either
// the bucket or the entry does not exist. A decoding failure or a failed
// read transaction is returned as an error with a nil record.
func (b *BoltResultStore) Get(runID, url string) (*ResultRec, error) {
	var found *ResultRec
	err := b.db.View(func(tx *bolt.Tx) error {
		bucket := tx.Bucket([]byte(runID))
		if bucket == nil {
			// Unknown run: leave found as nil.
			return nil
		}
		raw := bucket.Get([]byte(url))
		if raw == nil {
			// No entry for this url: leave found as nil.
			return nil
		}
		decoded := &ResultRec{}
		if err := json.Unmarshal(raw, &decoded); err != nil {
			return err
		}
		found = decoded
		return nil
	})
	if err != nil {
		return nil, err
	}
	return found, nil
}
// GetAll returns all the ResultRecs containing diff metrics that are stored in
// the bucket associated with the given runID. If the bucket does not exist an
// empty, non-nil list is returned. Entries without diff metrics, including an
// entry stored as JSON null, are skipped. A decoding failure or a failed read
// transaction is returned as an error with a nil list.
func (b *BoltResultStore) GetAll(runID string) ([]*ResultRec, error) {
	recs := []*ResultRec{}
	err := b.db.View(func(tx *bolt.Tx) error {
		bucket := tx.Bucket([]byte(runID))
		if bucket == nil {
			return nil
		}
		// Deserialize every entry and keep the ones that carry diff metrics.
		return bucket.ForEach(func(_, v []byte) error {
			rec := &ResultRec{}
			// Decode into the struct itself rather than into the pointer
			// variable: a stored JSON null then leaves rec as an empty record
			// instead of resetting it to nil, which would make the DiffMetrics
			// check below dereference a nil pointer.
			if err := json.Unmarshal(v, rec); err != nil {
				return err
			}
			if rec.DiffMetrics != nil {
				recs = append(recs, rec)
			}
			return nil
		})
	})
	if err != nil {
		return nil, err
	}
	return recs, nil
}
// GetRunIDs returns the IDs of all the runs that were completed in the given
// time range; both bounds are exclusive. If this func is called with the
// parameters resultstore.BeginningOfTime and time.Now(), all runIDs in the
// database are returned.
//
// Every top-level bucket name is treated as a runID of the form
// userID-timestamp. An error is returned if a bucket name contains no "-"
// separator or if its timestamp cannot be parsed with util.TS_FORMAT.
func (b *BoltResultStore) GetRunIDs(start, end time.Time) ([]string, error) {
	runIDs := []string{}
	err := b.db.View(func(tx *bolt.Tx) error {
		return tx.ForEach(func(name []byte, _ *bolt.Bucket) error {
			runID := string(name)
			// The timestamp is the second dash-separated segment. Check that
			// it exists so a malformed bucket name is reported as an error
			// rather than an index-out-of-range panic.
			parts := strings.Split(runID, "-")
			if len(parts) < 2 {
				return fmt.Errorf("Run ID %q is not in the form userID-timestamp", runID)
			}
			runTime, err := time.Parse(util.TS_FORMAT, parts[1])
			if err != nil {
				return err
			}
			// Append the runID to the list if it falls in the specified range.
			if start.Before(runTime) && end.After(runTime) {
				runIDs = append(runIDs, runID)
			}
			return nil
		})
	})
	if err != nil {
		return nil, err
	}
	return runIDs, nil
}
// Put uses the given runID to specify the storage bucket within the boltDB,
// creating the bucket if needed. The ResultRec is encoded as JSON and put into
// the database with the url of the screenshots as the key. If a record already
// exists for the given runID and url, it is overwritten. If the update
// succeeds and the ResultRec has diff metrics, it is also appended to the
// cache for the run.
//
// An error is returned if rec is nil, if it cannot be encoded, or if the
// database update fails; in all of these cases the cache is left untouched.
func (b *BoltResultStore) Put(runID, url string, rec *ResultRec) error {
	// Reject a nil record up front. It would otherwise be persisted as JSON
	// null and then dereferenced below when checking for diff metrics.
	if rec == nil {
		return fmt.Errorf("Cannot store a nil ResultRec for run %s and url %s", runID, url)
	}

	// Serialize the ResultRec before opening the write transaction.
	encoded, err := json.Marshal(rec)
	if err != nil {
		return err
	}

	err = b.db.Update(func(tx *bolt.Tx) error {
		// Create or retrieve bucket using the runID.
		bucket, err := tx.CreateBucketIfNotExists([]byte(runID))
		if err != nil {
			return err
		}
		// Put the record in the runID bucket: key = url, value = ResultRec
		return bucket.Put([]byte(url), encoded)
	})
	if err != nil {
		return err
	}

	// Add the ResultRec to the cache if it has diff metrics. Appending to the
	// missing (nil) entry of an unseen run creates its list.
	if rec.DiffMetrics != nil {
		b.cache[runID] = append(b.cache[runID], rec)
	}
	return nil
}
// RemoveRun drops the bucket named runID from the boltDB instance. Only when
// that succeeds is the run's entry removed from the cache as well; if the
// bucket deletion fails the error is returned and the cache is left as is.
func (b *BoltResultStore) RemoveRun(runID string) error {
	err := b.db.Update(func(tx *bolt.Tx) error {
		return tx.DeleteBucket([]byte(runID))
	})
	if err != nil {
		return err
	}
	// delete is a no-op for a runID that was never cached.
	delete(b.cache, runID)
	return nil
}
// GetFiltered returns at most CHUNK_SIZE ResultRecs whose PixelDiffPercent is
// greater than or equal to the given min and less than or equal to the given
// max. The filtering starts at the given start index for the runID's cached
// results. Also returns the next index so that the frontend knows where to
// start its next filter query in the cache; the next index equals the number
// of cached results once the end of the cache has been reached.
//
// Returns an error (with a nil list and a next index of -1) if there is no
// data cached for the runID or if startIdx is negative.
func (b *BoltResultStore) GetFiltered(runID string, startIdx int, min float32, max float32) ([]*ResultRec, int, error) {
	results, ok := b.cache[runID]
	if !ok {
		return nil, -1, fmt.Errorf("No cached results for run %s", runID)
	}
	// A negative index would otherwise panic when indexing the cached list.
	if startIdx < 0 {
		return nil, -1, fmt.Errorf("Invalid start index %d for run %s", startIdx, runID)
	}

	ret := []*ResultRec{}
	for idx := startIdx; idx < len(results); idx++ {
		percent := results[idx].DiffMetrics.PixelDiffPercent
		if percent < min || percent > max {
			continue
		}
		ret = append(ret, results[idx])
		if len(ret) == CHUNK_SIZE {
			// The chunk is full; resume right after the last match.
			return ret, idx + 1, nil
		}
	}
	// The cache was exhausted before the chunk filled up.
	return ret, len(results), nil
}
// SortRun sorts the cached ResultRecs for the given runID in place using the
// given sort parameter and sort order. sortField must be one of NUM_DIFF,
// PERCENT_DIFF, RED_DIFF, GREEN_DIFF, BLUE_DIFF or RANK. A sortOrder equal to
// DSC sorts descending; any other value sorts ascending.
//
// Returns an error if there is no data cached for the runID or if sortField
// is not one of the recognized values.
func (b *BoltResultStore) SortRun(runID, sortField, sortOrder string) error {
	results, ok := b.cache[runID]
	if !ok {
		return fmt.Errorf("No cached results for run %s", runID)
	}

	var lessFn resultRecLessFn
	switch sortField {
	// The ResultRecs are sorted by URL if they have equal values for the sort
	// parameter.
	case NUM_DIFF:
		lessFn = sortByNumDiffPixels
	case PERCENT_DIFF:
		lessFn = sortByPercentDiffPixels
	case RED_DIFF:
		lessFn = sortByMaxRedDiff
	case GREEN_DIFF:
		lessFn = sortByMaxGreenDiff
	case BLUE_DIFF:
		lessFn = sortByMaxBlueDiff
	case RANK:
		lessFn = sortByRank
	default:
		// Without this, lessFn stays nil and sort.Sort panics as soon as it
		// has to compare two elements.
		return fmt.Errorf("Unknown sort field %s for run %s", sortField, runID)
	}

	// results shares its backing array with the cached list, so sorting it
	// reorders the cache itself.
	sortSlice := sort.Interface(newResultRecSlice(results, lessFn))
	if sortOrder == DSC {
		sortSlice = sort.Reverse(sortSlice)
	}
	sort.Sort(sortSlice)
	return nil
}
// resultRecLessFn is the function signature for a ResultRec comparator. It
// reports whether the element at index i of r should be ordered before the
// element at index j; resultRecSlice.Less delegates to it.
type resultRecLessFn func(r *resultRecSlice, i, j int) bool
// resultRecSlice wraps around a list of ResultRec instances and implements
// sort.Interface.
type resultRecSlice struct {
// lessFn is the comparator that Less delegates to.
lessFn resultRecLessFn
// data is the list being sorted; Swap reorders it in place.
data []*ResultRec
}
// newResultRecSlice wraps data together with the comparator lessFn so the
// pair can be handed to the sort package. The slice is not copied, so sorting
// the wrapper reorders the caller's data.
func newResultRecSlice(data []*ResultRec, lessFn resultRecLessFn) *resultRecSlice {
	wrapped := new(resultRecSlice)
	wrapped.data = data
	wrapped.lessFn = lessFn
	return wrapped
}
// Len, Less and Swap make resultRecSlice satisfy sort.Interface.

// Len returns the number of wrapped ResultRecs.
func (r *resultRecSlice) Len() int {
	return len(r.data)
}

// Less defers to the comparator the slice was constructed with.
func (r *resultRecSlice) Less(i, j int) bool {
	return r.lessFn(r, i, j)
}

// Swap exchanges the elements at indices i and j in place.
func (r *resultRecSlice) Swap(i, j int) {
	r.data[i], r.data[j] = r.data[j], r.data[i]
}
// sortByNumDiffPixels orders by the number of differing pixels, falling back
// to the URL when both records have the same count.
func sortByNumDiffPixels(r *resultRecSlice, i, j int) bool {
	a, b := r.data[i], r.data[j]
	if a.DiffMetrics.NumDiffPixels != b.DiffMetrics.NumDiffPixels {
		return a.DiffMetrics.NumDiffPixels < b.DiffMetrics.NumDiffPixels
	}
	return a.URL < b.URL
}
// sortByPercentDiffPixels orders by the percentage of differing pixels,
// falling back to the URL when both records have the same percentage.
func sortByPercentDiffPixels(r *resultRecSlice, i, j int) bool {
	a, b := r.data[i], r.data[j]
	if a.DiffMetrics.PixelDiffPercent == b.DiffMetrics.PixelDiffPercent {
		return a.URL < b.URL
	}
	return a.DiffMetrics.PixelDiffPercent < b.DiffMetrics.PixelDiffPercent
}
// sortByMaxRedDiff orders by the maximum red difference (element 0 of
// MaxRGBDiffs), falling back to the URL when both records have the same value.
func sortByMaxRedDiff(r *resultRecSlice, i, j int) bool {
	a, b := r.data[i], r.data[j]
	redA, redB := a.DiffMetrics.MaxRGBDiffs[0], b.DiffMetrics.MaxRGBDiffs[0]
	if redA != redB {
		return redA < redB
	}
	return a.URL < b.URL
}
// sortByMaxGreenDiff orders by the maximum green difference (element 1 of
// MaxRGBDiffs), falling back to the URL when both records have the same value.
func sortByMaxGreenDiff(r *resultRecSlice, i, j int) bool {
	a, b := r.data[i], r.data[j]
	greenA, greenB := a.DiffMetrics.MaxRGBDiffs[1], b.DiffMetrics.MaxRGBDiffs[1]
	if greenA != greenB {
		return greenA < greenB
	}
	return a.URL < b.URL
}
// sortByMaxBlueDiff orders by the maximum blue difference (element 2 of
// MaxRGBDiffs), falling back to the URL when both records have the same value.
func sortByMaxBlueDiff(r *resultRecSlice, i, j int) bool {
	a, b := r.data[i], r.data[j]
	blueA, blueB := a.DiffMetrics.MaxRGBDiffs[2], b.DiffMetrics.MaxRGBDiffs[2]
	if blueA != blueB {
		return blueA < blueB
	}
	return a.URL < b.URL
}
// sortByRank orders by the site popularity rank, placing the record with the
// numerically larger Rank first. Two ResultRec instances within the same
// slice will never have the same rank, so no tie-breaker is applied.
func sortByRank(r *resultRecSlice, i, j int) bool {
	rankI := r.data[i].Rank
	rankJ := r.data[j].Rank
	return rankJ < rankI
}
// GetURLs returns one map per cached result for the given runID, in cache
// order. Each map has two entries: TEXT holds the URL with a leading
// "http://" or "https://" and a following "www." removed, and VALUE holds the
// prefixes that were removed, concatenated. These fields are required by the
// frontend element responsible for making url suggestions. Returns an error
// if there is no data cached for the runID.
func (b *BoltResultStore) GetURLs(runID string) ([]map[string]string, error) {
	results, ok := b.cache[runID]
	if !ok {
		return nil, fmt.Errorf("No cached results for run %s", runID)
	}

	urls := []map[string]string{}
	for _, rec := range results {
		var stripped bytes.Buffer
		remainder := rec.URL

		// At most one scheme prefix is removed.
		for _, scheme := range []string{HTTP, HTTPS} {
			if strings.HasPrefix(remainder, scheme) {
				stripped.WriteString(scheme)
				remainder = remainder[len(scheme):]
				break
			}
		}

		// A "www." that follows the scheme (or starts the URL) is removed too.
		if strings.HasPrefix(remainder, WWW) {
			stripped.WriteString(WWW)
			remainder = remainder[len(WWW):]
		}

		urls = append(urls, map[string]string{
			TEXT:  remainder,
			VALUE: stripped.String(),
		})
	}
	return urls, nil
}
// GetStats summarizes the cached results for the given runID in two maps. The
// first holds counts keyed by NUM_TOTAL_RESULTS, NUM_DYNAMIC_CONTENT (results
// with at least one dynamic pixel) and NUM_ZERO_DIFF (results whose pixel diff
// percentage is exactly 0). The second is a histogram of pixel diff
// percentages keyed by BUCKET_0 through BUCKET_9; a percentage outside
// [0, 100] is not counted in any bucket. All keys are always present, with a
// value of 0 if nothing was counted. Returns an error if there is no data
// cached for the runID.
func (b *BoltResultStore) GetStats(runID string) (map[string]int, map[string]int, error) {
	results, ok := b.cache[runID]
	if !ok {
		return nil, nil, fmt.Errorf("No cached results for run %s", runID)
	}

	stats := map[string]int{}
	for _, key := range []string{NUM_TOTAL_RESULTS, NUM_DYNAMIC_CONTENT, NUM_ZERO_DIFF} {
		stats[key] = 0
	}
	histogram := map[string]int{}
	for _, bucket := range []string{
		BUCKET_0, BUCKET_1, BUCKET_2, BUCKET_3, BUCKET_4,
		BUCKET_5, BUCKET_6, BUCKET_7, BUCKET_8, BUCKET_9,
	} {
		histogram[bucket] = 0
	}

	for _, rec := range results {
		metrics := rec.DiffMetrics
		stats[NUM_TOTAL_RESULTS]++
		if metrics.NumDynamicPixels > 0 {
			stats[NUM_DYNAMIC_CONTENT]++
		}

		percent := metrics.PixelDiffPercent
		if percent == 0 {
			stats[NUM_ZERO_DIFF]++
		}

		// Cases are tried in order, so each one only needs its upper bound.
		switch {
		case percent < 0:
			// Below the histogram range; not counted.
		case percent < 10:
			histogram[BUCKET_0]++
		case percent < 20:
			histogram[BUCKET_1]++
		case percent < 30:
			histogram[BUCKET_2]++
		case percent < 40:
			histogram[BUCKET_3]++
		case percent < 50:
			histogram[BUCKET_4]++
		case percent < 60:
			histogram[BUCKET_5]++
		case percent < 70:
			histogram[BUCKET_6]++
		case percent < 80:
			histogram[BUCKET_7]++
		case percent < 90:
			histogram[BUCKET_8]++
		case percent <= 100:
			histogram[BUCKET_9]++
		}
	}
	return stats, histogram, nil
}