blob: c9c7f54c1bbcfdd6177a0e1a0ee22f4afafc9367 [file] [log] [blame] [edit]
// Command line application to filter the output of a CSV file downloaded from
// Perf. This application strips out duplicated columns from the same day,
// keeping only the first column to appear for each day, and then emits the
// altered CSV file on stdout.
//
// See the unit test for an example.
package main
import (
"encoding/csv"
"flag"
"fmt"
"io"
"math"
"os"
"regexp"
"sort"
"strconv"
"go.skia.org/infra/go/skerr"
"go.skia.org/infra/go/sklog"
"go.skia.org/infra/go/util"
"go.skia.org/infra/go/util_generics"
)
var (
in = flag.String("in", "", "input filename")
// Matches RFC3339 dates.
datetime = regexp.MustCompile(`^((?:(\d{4}-\d{2}-\d{2})T(\d{2}:\d{2}:\d{2}(?:\.\d+)?))(Z|[\+-]\d{2}:\d{2})?)$`)
)
func removeValueFromSliceAtIndex(s []string, index int) []string {
ret := make([]string, 0, len(s)-1)
ret = append(ret, s[:index]...)
return append(ret, s[index+1:]...)
}
// Note that skipCols should be in reverse sorted order, i.e. largest to
// smallest, for this to work correctly.
func removeAllIndexesFromSlices(s []string, skipCols []int) []string {
for _, col := range skipCols {
s = removeValueFromSliceAtIndex(s, col)
}
return s
}
// Returns the max value over all the floats found.
func max(s []string) string {
var ret float64 = -math.MaxFloat64
for _, floatAsString := range s {
v, err := strconv.ParseFloat(floatAsString, 64)
if err != nil {
continue
}
if v > ret {
ret = v
}
}
if ret == -math.MaxFloat64 {
return s[0]
}
return fmt.Sprintf("%g", ret)
}
func applyMaxToRuns(s []string, runLengths map[int]int) []string {
for runStart, runLength := range runLengths {
s[runStart] = max(s[runStart : runStart+runLength])
}
return s
}
func main() {
var inputFilename string
// Create and parse flags.
fs := flag.NewFlagSet("csv2days", flag.ExitOnError)
fs.StringVar(&inputFilename, "in", "", "input filename")
err := fs.Parse(os.Args[1:])
if err != nil {
sklog.Fatal(err)
}
if inputFilename == "" {
fmt.Println("The --in flag must be supplied.")
flag.Usage()
os.Exit(1)
}
err = util.WithReadFile(inputFilename, func(f io.Reader) error {
return transformCSV(f, os.Stdout)
})
if err != nil {
sklog.Fatal(err)
}
}
func transformCSV(input io.Reader, output io.Writer) error {
in := csv.NewReader(input)
out := csv.NewWriter(output)
// Read in the header.
header, err := in.Read()
if err != nil {
return skerr.Wrap(err)
}
// Determine which columns to drop from output.
lastDate := ""
// The columns to ignore.
skipCols := []int{}
// The number of columns that appear on a single day.
runLengths := map[int]int{}
outHeader := []string{}
beginningOfCurrentRun := 0
for index, h := range header {
if !datetime.MatchString(h) {
outHeader = append(outHeader, h)
beginningOfCurrentRun = index
continue
}
// Preserve just the date. Also record all the columns that need to be
// combined into a single value.
day := h[:10]
if day == lastDate {
skipCols = append(skipCols, index)
runLengths[beginningOfCurrentRun] = util_generics.Get(runLengths, beginningOfCurrentRun, 1) + 1
} else {
outHeader = append(outHeader, day)
lastDate = day
beginningOfCurrentRun = index
}
}
err = out.Write(outHeader)
if err != nil {
return skerr.Wrap(err)
}
sort.Sort(sort.Reverse(sort.IntSlice(skipCols)))
for {
record, err := in.Read()
if err == io.EOF {
break
}
if err != nil {
return skerr.Wrap(err)
}
record = applyMaxToRuns(record, runLengths)
record = removeAllIndexesFromSlices(record, skipCols)
err = out.Write(record)
if err != nil {
return skerr.Wrap(err)
}
}
out.Flush()
if out.Error() != nil {
return skerr.Wrap(err)
}
return nil
}