258 lines
8.0 KiB
Go
258 lines
8.0 KiB
Go
|
/* Copyright 2016-2017 Vector Creations Ltd
|
||
|
*
|
||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
* you may not use this file except in compliance with the License.
|
||
|
* You may obtain a copy of the License at
|
||
|
*
|
||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||
|
*
|
||
|
* Unless required by applicable law or agreed to in writing, software
|
||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
* See the License for the specific language governing permissions and
|
||
|
* limitations under the License.
|
||
|
*/
|
||
|
|
||
|
package canonicaljson
|
||
|
|
||
|
import (
|
||
|
"encoding/binary"
|
||
|
"fmt"
|
||
|
"sort"
|
||
|
"unicode/utf8"
|
||
|
|
||
|
"github.com/tidwall/gjson"
|
||
|
)
|
||
|
|
||
|
// CanonicalJSON re-encodes the JSON in a canonical encoding. The encoding is
|
||
|
// the shortest possible encoding using integer values with sorted object keys.
|
||
|
// https://matrix.org/docs/spec/appendices#canonical-json
|
||
|
func CanonicalJSON(input []byte) ([]byte, error) {
|
||
|
if !gjson.Valid(string(input)) {
|
||
|
return nil, fmt.Errorf("invalid json")
|
||
|
}
|
||
|
|
||
|
return CanonicalJSONAssumeValid(input), nil
|
||
|
}
|
||
|
|
||
|
// CanonicalJSONAssumeValid is the same as CanonicalJSON, but assumes the
|
||
|
// input is valid JSON
|
||
|
func CanonicalJSONAssumeValid(input []byte) []byte {
|
||
|
input = CompactJSON(input, make([]byte, 0, len(input)))
|
||
|
return SortJSON(input, make([]byte, 0, len(input)))
|
||
|
}
|
||
|
|
||
|
// SortJSON reencodes the JSON with the object keys sorted by lexicographically
|
||
|
// by codepoint. The input must be valid JSON.
|
||
|
func SortJSON(input, output []byte) []byte {
|
||
|
result := gjson.ParseBytes(input)
|
||
|
|
||
|
return sortJSONValue(result, input, output)
|
||
|
}
|
||
|
|
||
|
// sortJSONValue takes a gjson.Result and sorts it. inputJSON must be the
|
||
|
// raw JSON bytes that gjson.Result points to.
|
||
|
func sortJSONValue(input gjson.Result, inputJSON, output []byte) []byte {
|
||
|
if input.IsArray() {
|
||
|
return sortJSONArray(input, inputJSON, output)
|
||
|
}
|
||
|
|
||
|
if input.IsObject() {
|
||
|
return sortJSONObject(input, inputJSON, output)
|
||
|
}
|
||
|
|
||
|
// If its neither an object nor an array then there is no sub structure
|
||
|
// to sort, so just append the raw bytes.
|
||
|
return append(output, input.Raw...)
|
||
|
}
|
||
|
|
||
|
// sortJSONArray takes a gjson.Result and sorts it, assuming its an array.
|
||
|
// inputJSON must be the raw JSON bytes that gjson.Result points to.
|
||
|
func sortJSONArray(input gjson.Result, inputJSON, output []byte) []byte {
|
||
|
sep := byte('[')
|
||
|
|
||
|
// Iterate over each value in the array and sort it.
|
||
|
input.ForEach(func(_, value gjson.Result) bool {
|
||
|
output = append(output, sep)
|
||
|
sep = ','
|
||
|
output = sortJSONValue(value, inputJSON, output)
|
||
|
return true // keep iterating
|
||
|
})
|
||
|
|
||
|
if sep == '[' {
|
||
|
// If sep is still '[' then the array was empty and we never wrote the
|
||
|
// initial '[', so we write it now along with the closing ']'.
|
||
|
output = append(output, '[', ']')
|
||
|
} else {
|
||
|
// Otherwise we end the array by writing a single ']'
|
||
|
output = append(output, ']')
|
||
|
}
|
||
|
return output
|
||
|
}
|
||
|
|
||
|
// sortJSONObject takes a gjson.Result and sorts it, assuming its an object.
|
||
|
// inputJSON must be the raw JSON bytes that gjson.Result points to.
|
||
|
func sortJSONObject(input gjson.Result, inputJSON, output []byte) []byte {
|
||
|
type entry struct {
|
||
|
key string // The parsed key string
|
||
|
rawKey string // The raw, unparsed key JSON string
|
||
|
value gjson.Result
|
||
|
}
|
||
|
|
||
|
var entries []entry
|
||
|
|
||
|
// Iterate over each key/value pair and add it to a slice
|
||
|
// that we can sort
|
||
|
input.ForEach(func(key, value gjson.Result) bool {
|
||
|
entries = append(entries, entry{
|
||
|
key: key.String(),
|
||
|
rawKey: key.Raw,
|
||
|
value: value,
|
||
|
})
|
||
|
return true // keep iterating
|
||
|
})
|
||
|
|
||
|
// Sort the slice based on the *parsed* key
|
||
|
sort.Slice(entries, func(a, b int) bool {
|
||
|
return entries[a].key < entries[b].key
|
||
|
})
|
||
|
|
||
|
sep := byte('{')
|
||
|
|
||
|
for _, entry := range entries {
|
||
|
output = append(output, sep)
|
||
|
sep = ','
|
||
|
|
||
|
// Append the raw unparsed JSON key, *not* the parsed key
|
||
|
output = append(output, entry.rawKey...)
|
||
|
output = append(output, ':')
|
||
|
output = sortJSONValue(entry.value, inputJSON, output)
|
||
|
}
|
||
|
if sep == '{' {
|
||
|
// If sep is still '{' then the object was empty and we never wrote the
|
||
|
// initial '{', so we write it now along with the closing '}'.
|
||
|
output = append(output, '{', '}')
|
||
|
} else {
|
||
|
// Otherwise we end the object by writing a single '}'
|
||
|
output = append(output, '}')
|
||
|
}
|
||
|
return output
|
||
|
}
|
||
|
|
||
|
// CompactJSON appends to output the smallest encoding of the JSON in input
// and returns the extended slice. Whitespace between tokens is dropped,
// the optional "\/" escape is replaced by a plain '/', and "\uXXXX" escapes
// are replaced by the shortest equivalent (see compactUnicodeEscape).
//
// The input is expected to be valid JSON. Truncated input (for example a
// string that ends in a lone backslash) does not panic; the bytes that are
// present are processed and the function returns.
func CompactJSON(input, output []byte) []byte {
	var i int
	for i < len(input) {
		c := input[i]
		i++
		// The valid whitespace characters are all less than or equal to
		// SPACE 0x20 and every other valid character outside a string is
		// greater, so a single comparison detects whitespace.
		if c <= ' ' {
			continue
		}
		output = append(output, c)
		if c != '"' {
			continue
		}

		// We are inside a string: copy bytes until the closing quote,
		// rewriting escape sequences on the way.
		for i < len(input) {
			c = input[i]
			i++
			if c != '\\' {
				output = append(output, c)
				if c == '"' {
					break
				}
				continue
			}
			if i >= len(input) {
				// The input ends in the middle of an escape sequence.
				// Keep the backslash and stop rather than reading past
				// the end of the slice.
				output = append(output, c)
				break
			}
			escape := input[i]
			i++
			switch escape {
			case 'u':
				// Unicode escapes need decoding to find their shortest form.
				output, i = compactUnicodeEscape(input, output, i)
			case '/':
				// JSON allows but does not require '/' to be escaped, so
				// the backslash is dropped.
				output = append(output, escape)
			default:
				// All other permitted escapes are single character escapes
				// that are already in their shortest form.
				output = append(output, '\\', escape)
			}
		}
	}
	return output
}

// compactUnicodeEscape rewrites the unicode escape whose 4 hex digits start
// at input[index] (the "\u" has already been consumed) and appends the
// shortest equivalent encoding to output. It returns the extended output
// slice and the index of the first unconsumed input byte.
//
//   - Characters below SPACE 0x20 are written as a short escape ("\n", "\t",
//     ...) when one exists and as a lower-case "\u00xx" escape otherwise.
//   - '"' and '\' are written as "\"" and "\\".
//   - A high surrogate immediately followed by a "\uXXXX" low surrogate is
//     combined and written as a single UTF-8 encoded character.
//   - A surrogate that is not part of such a pair is written as U+FFFD, and
//     nothing after it is consumed.
//   - Everything else is written as UTF-8.
//
// If fewer than 4 bytes remain the escape is dropped and len(input) is
// returned as the new index.
func compactUnicodeEscape(input, output []byte, index int) ([]byte, int) {
	const hexDigits = "0123456789abcdef"

	// If there aren't enough bytes to decode the hex escape then return.
	if len(input)-index < 4 {
		return output, len(input)
	}
	c := readHexDigits(input[index : index+4])
	index += 4

	var r rune
	switch {
	case c < ' ':
		// Control characters must stay escaped. Prefer the two byte
		// escapes and fall back to "\u00xx" with lower-case hex digits.
		switch c {
		case '\b':
			output = append(output, '\\', 'b')
		case '\t':
			output = append(output, '\\', 't')
		case '\n':
			output = append(output, '\\', 'n')
		case '\f':
			output = append(output, '\\', 'f')
		case '\r':
			output = append(output, '\\', 'r')
		default:
			output = append(output, '\\', 'u', '0', '0', '0'+byte(c>>4), hexDigits[c&0xF])
		}
		return output, index
	case c == '\\' || c == '"':
		// QUOTE and BACKSLASH are the only other characters that need escaping.
		return append(output, '\\', byte(c)), index
	case c < 0xD800 || c >= 0xE000:
		// Not a surrogate: encode the character directly.
		r = rune(c)
	default:
		// A surrogate is only meaningful as a high surrogate followed
		// directly by an escaped low surrogate. Anything else becomes the
		// replacement character, and the bytes after it are left for the
		// caller to process normally.
		r = utf8.RuneError
		if c < 0xDC00 && len(input)-index >= 6 && input[index] == '\\' && input[index+1] == 'u' {
			low := readHexDigits(input[index+2 : index+6])
			if low >= 0xDC00 && low < 0xE000 {
				// Reconstruct the codepoint from the two surrogates.
				r = rune(0x10000 + (((c & 0x3FF) << 10) | (low & 0x3FF)))
				index += 6
			}
		}
	}

	var buffer [utf8.UTFMax]byte
	n := utf8.EncodeRune(buffer[:], r)
	return append(output, buffer[:n]...), index
}

// readHexDigits decodes the first 4 bytes of input as ASCII hex digits
// (either case) and returns their 16 bit value. All four digits are
// converted at once with word-wide arithmetic. The input must hold at least
// 4 bytes, and they are not checked for being hex digits.
// Taken from https://github.com/NegativeMjark/indolentjson-rust/blob/8b959791fe2656a88f189c5d60d153be05fe3deb/src/readhex.rs#L21
func readHexDigits(input []byte) uint32 {
	hex := binary.BigEndian.Uint32(input)
	// Subtract '0' from every byte: digits become 0x00-0x09, letters
	// become 0x11-0x16 (upper case) or 0x31-0x36 (lower case).
	hex -= 0x30303030
	// Strip the higher bits, which maps 'a' => 'A'.
	hex &= 0x1F1F1F1F
	// Bit 4 is now set in exactly the bytes that held a letter.
	mask := hex & 0x10101010
	// Subtract 7 from the letters (-8, then +1) so that 0x11-0x16
	// becomes 0x0A-0x0F.
	hex -= mask >> 1
	hex += mask >> 4
	// Collect the four nibbles (0x0A0B0C0D) into the low 16 bits (0xABCD).
	hex |= hex >> 4
	hex &= 0xFF00FF
	hex |= hex >> 8
	return hex & 0xFFFF
}
|