From 806c9cd19a24e27e91c31a1f56f140cbcae32c02 Mon Sep 17 00:00:00 2001
From: Neil Pankey <npankey@gmail.com>
Date: Thu, 17 Sep 2020 21:08:40 -0700
Subject: [PATCH] yajsv: UTF-16 and BOM handling

---
 main.go | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 63 insertions(+), 2 deletions(-)

diff --git a/main.go b/main.go
index 8ee1767..766b97e 100644
--- a/main.go
+++ b/main.go
@@ -2,8 +2,11 @@
 // a provided JSON Schema - https://json-schema.org/
 package main
 
+//go:generate go run gen_testdata.go
+
 import (
 	"bufio"
+	"bytes"
 	"flag"
 	"fmt"
 	"io"
@@ -15,21 +18,37 @@ import (
 	"strings"
 	"sync"
 
+	"golang.org/x/text/encoding"
+	"golang.org/x/text/encoding/unicode"
+
 	"github.com/ghodss/yaml"
 	"github.com/mitchellh/go-homedir"
 	"github.com/xeipuuv/gojsonschema"
 )
 
 var (
-	version     = "v1.3.0-dev"
+	version     = "v1.4.0-dev"
 	schemaFlag  = flag.String("s", "", "primary JSON schema to validate against, required")
 	quietFlag   = flag.Bool("q", false, "quiet, only print validation failures and errors")
 	versionFlag = flag.Bool("v", false, "print version and exit")
+	bomFlag     = flag.Bool("b", false, "allow BOM in JSON files, error if seen and unset")
 
 	listFlags stringFlags
 	refFlags  stringFlags
 )
 
+// Byte order marks by encoding: https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding
+const (
+	bomUTF8    = "\xEF\xBB\xBF" // UTF-8 byte order mark
+	bomUTF16BE = "\xFE\xFF"     // UTF-16 big-endian byte order mark
+	bomUTF16LE = "\xFF\xFE"     // UTF-16 little-endian byte order mark
+)
+
+var ( // UTF-16 encodings used by jsonDecodeCharset, which strips any BOM itself before decoding
+	encUTF16BE = unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM)    // big-endian, BOM policy IgnoreBOM
+	encUTF16LE = unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM) // little-endian, BOM policy IgnoreBOM
+)
+
 func init() {
 	flag.Var(&listFlags, "l", "validate JSON documents from newline separated paths and/or globs in a text file (relative to the basename of the file itself)")
 	flag.Var(&refFlags, "r", "referenced schema(s), can be globs and/or used multiple times")
@@ -131,7 +150,6 @@ func realMain(args []string, w io.Writer) int {
 			sem <- 0
 			defer func() { <-sem }()
 
-
 			loader, err := jsonLoader(path)
 			if err != nil {
 				msg := fmt.Sprintf("%s: error: load doc: %s", path, err)
@@ -190,14 +208,57 @@ func jsonLoader(path string) (gojsonschema.JSONLoader, error) {
 	}
 	switch filepath.Ext(path) {
 	case ".yml", ".yaml":
+		// TODO YAML requires the presence of a BOM to detect UTF-16
+		// text. Is there a decent heuristic to detect UTF-16 text
+		// missing a BOM so we can provide a better error message?
 		buf, err = yaml.YAMLToJSON(buf)
+	default:
+		buf, err = jsonDecodeCharset(buf)
 	}
 	if err != nil {
 		return nil, err
 	}
+	// TODO What if we have an empty document?
 	return gojsonschema.NewBytesLoader(buf), nil
 }
 
+// jsonDecodeCharset detects UTF-16 (BE or LE) JSON text, either by its BOM or by a NUL in one of the
+// first two bytes, and decodes it with the matching decoder; other input is returned without decoding.
+// A leading BOM is stripped if `-b` was specified; otherwise a BOM yields a nil slice and an error.
+func jsonDecodeCharset(buf []byte) ([]byte, error) {
+	if len(buf) < 2 { // too short for the two-byte probes below; returned unchanged
+		return buf, nil
+	}
+
+	bom := "" // BOM literal found at the start of buf, "" if none
+	var enc encoding.Encoding // stays nil unless UTF-16 is detected
+	switch { // first matching case wins, so BOM checks take precedence over the NUL probes
+	case bytes.HasPrefix(buf, []byte(bomUTF8)): // UTF-8 BOM: stripped below, no decoding needed
+		bom = bomUTF8
+	case bytes.HasPrefix(buf, []byte(bomUTF16BE)):
+		bom = bomUTF16BE
+		enc = encUTF16BE
+	case bytes.HasPrefix(buf, []byte(bomUTF16LE)):
+		bom = bomUTF16LE
+		enc = encUTF16LE
+	case buf[0] == 0: // no BOM, first byte is NUL: treated as UTF-16BE
+		enc = encUTF16BE
+	case buf[1] == 0: // no BOM, second byte is NUL: treated as UTF-16LE
+		enc = encUTF16LE
+	}
+
+	if bom != "" {
+		if !*bomFlag { // BOMs are rejected unless the `-b` flag was given
+			return nil, fmt.Errorf("unexpected BOM, see `-b` flag")
+		}
+		buf = buf[len(bom):] // drop the BOM bytes before any decoding
+	}
+	if enc != nil {
+		return enc.NewDecoder().Bytes(buf) // decoder result and error are passed through as-is
+	}
+	return buf, nil
+}
+
 func printUsage() {
 	fmt.Fprintf(os.Stderr, `Usage: %s -s schema.(json|yml) [options] document.(json|yml) ...
 
-- 
GitLab