@@ -15,7 +15,6 @@ | |||
.settings | |||
.vscode | |||
target/ | |||
build/ | |||
out/ | |||
tmp/ | |||
dist/ |
@@ -0,0 +1,43 @@ | |||
# MIT License | |||
# | |||
# Copyright (c) Microsoft Corporation. All rights reserved. | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in all | |||
# copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE | |||
FROM golang:alpine as builder | |||
ENV PROJECT_DIR=${GOPATH}/src/github.com/microsoft/frameworkcontroller | |||
ENV INSTALL_DIR=/opt/frameworkcontroller/frameworkbarrier | |||
RUN apk update && apk add --no-cache bash && \ | |||
mkdir -p ${PROJECT_DIR} ${INSTALL_DIR} | |||
COPY . ${PROJECT_DIR} | |||
RUN ${PROJECT_DIR}/build/frameworkbarrier/go-build.sh && \ | |||
mv ${PROJECT_DIR}/dist/frameworkbarrier/* ${INSTALL_DIR} | |||
FROM alpine:latest | |||
ENV INSTALL_DIR=/opt/frameworkcontroller/frameworkbarrier | |||
RUN apk update && apk add --no-cache bash | |||
COPY --from=builder ${INSTALL_DIR} ${INSTALL_DIR} | |||
WORKDIR ${INSTALL_DIR} | |||
ENTRYPOINT ["./start.sh"] |
@@ -0,0 +1,37 @@ | |||
#!/usr/bin/env bash | |||
# MIT License | |||
# | |||
# Copyright (c) Microsoft Corporation. All rights reserved. | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in all | |||
# copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE | |||
set -o errexit | |||
set -o nounset | |||
set -o pipefail | |||
BASH_DIR=$(cd $(dirname ${BASH_SOURCE}) && pwd) | |||
PROJECT_DIR=${BASH_DIR}/../.. | |||
IMAGE_NAME=frameworkbarrier | |||
cd ${PROJECT_DIR} | |||
docker build -t ${IMAGE_NAME} -f ${BASH_DIR}/Dockerfile . | |||
echo Succeeded to build docker image ${IMAGE_NAME} |
@@ -0,0 +1,44 @@ | |||
#!/usr/bin/env bash | |||
# MIT License | |||
# | |||
# Copyright (c) Microsoft Corporation. All rights reserved. | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in all | |||
# copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE | |||
set -o errexit | |||
set -o nounset | |||
set -o pipefail | |||
BASH_DIR=$(cd $(dirname ${BASH_SOURCE}) && pwd) | |||
# Ensure ${PROJECT_DIR} is ${GOPATH}/src/github.com/microsoft/frameworkcontroller | |||
PROJECT_DIR=${BASH_DIR}/../.. | |||
DIST_DIR=${PROJECT_DIR}/dist/frameworkbarrier | |||
cd ${PROJECT_DIR} | |||
rm -rf ${DIST_DIR} | |||
mkdir -p ${DIST_DIR} | |||
go build -o ${DIST_DIR}/frameworkbarrier cmd/frameworkbarrier/* | |||
chmod a+x ${DIST_DIR}/frameworkbarrier | |||
cp -r bin/frameworkbarrier/* ${DIST_DIR} | |||
echo Succeeded to build binary distribution into ${DIST_DIR}: | |||
cd ${DIST_DIR} && ls -lR . |
@@ -0,0 +1,43 @@ | |||
# MIT License | |||
# | |||
# Copyright (c) Microsoft Corporation. All rights reserved. | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in all | |||
# copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE | |||
FROM golang:alpine as builder | |||
ENV PROJECT_DIR=${GOPATH}/src/github.com/microsoft/frameworkcontroller | |||
ENV INSTALL_DIR=/opt/frameworkcontroller/frameworkcontroller | |||
RUN apk update && apk add --no-cache bash && \ | |||
mkdir -p ${PROJECT_DIR} ${INSTALL_DIR} | |||
COPY . ${PROJECT_DIR} | |||
RUN ${PROJECT_DIR}/build/frameworkcontroller/go-build.sh && \ | |||
mv ${PROJECT_DIR}/dist/frameworkcontroller/* ${INSTALL_DIR} | |||
FROM alpine:latest | |||
ENV INSTALL_DIR=/opt/frameworkcontroller/frameworkcontroller | |||
RUN apk update && apk add --no-cache bash | |||
COPY --from=builder ${INSTALL_DIR} ${INSTALL_DIR} | |||
WORKDIR ${INSTALL_DIR} | |||
ENTRYPOINT ["./start.sh"] |
@@ -0,0 +1,37 @@ | |||
#!/usr/bin/env bash | |||
# MIT License | |||
# | |||
# Copyright (c) Microsoft Corporation. All rights reserved. | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in all | |||
# copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE | |||
set -o errexit | |||
set -o nounset | |||
set -o pipefail | |||
BASH_DIR=$(cd $(dirname ${BASH_SOURCE}) && pwd) | |||
PROJECT_DIR=${BASH_DIR}/../.. | |||
IMAGE_NAME=frameworkcontroller | |||
cd ${PROJECT_DIR} | |||
docker build -t ${IMAGE_NAME} -f ${BASH_DIR}/Dockerfile . | |||
echo Succeeded to build docker image ${IMAGE_NAME} |
@@ -0,0 +1,45 @@ | |||
#!/usr/bin/env bash | |||
# MIT License | |||
# | |||
# Copyright (c) Microsoft Corporation. All rights reserved. | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in all | |||
# copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE | |||
set -o errexit | |||
set -o nounset | |||
set -o pipefail | |||
BASH_DIR=$(cd $(dirname ${BASH_SOURCE}) && pwd) | |||
# Ensure ${PROJECT_DIR} is ${GOPATH}/src/github.com/microsoft/frameworkcontroller | |||
PROJECT_DIR=${BASH_DIR}/../.. | |||
DIST_DIR=${PROJECT_DIR}/dist/frameworkcontroller | |||
cd ${PROJECT_DIR} | |||
rm -rf ${DIST_DIR} | |||
mkdir -p ${DIST_DIR} | |||
go build -o ${DIST_DIR}/frameworkcontroller cmd/frameworkcontroller/* | |||
chmod a+x ${DIST_DIR}/frameworkcontroller | |||
cp -r bin/frameworkcontroller/* ${DIST_DIR} | |||
cp -r example/config/default/frameworkcontroller.yaml ${DIST_DIR} | |||
echo Succeeded to build binary distribution into ${DIST_DIR}: | |||
cd ${DIST_DIR} && ls -lR . |
@@ -0,0 +1,35 @@ | |||
# MIT License | |||
# | |||
# Copyright (c) Microsoft Corporation. All rights reserved. | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in all | |||
# copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE | |||
# Add tensorflow/tensorflow | |||
FROM tensorflow/tensorflow:1.12.0-devel | |||
ENV PROJECT_DIR=/tensorflow | |||
WORKDIR ${PROJECT_DIR} | |||
# Add tensorflow/benchmarks | |||
RUN git clone --branch=cnn_tf_v1.12_compatible --depth=1 \ | |||
https://github.com/tensorflow/benchmarks.git | |||
RUN ls -l . | |||
ENTRYPOINT ["/bin/bash"] |
@@ -0,0 +1,37 @@ | |||
#!/bin/bash | |||
# MIT License | |||
# | |||
# Copyright (c) Microsoft Corporation. All rights reserved. | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in all | |||
# copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE | |||
set -o errexit | |||
set -o nounset | |||
set -o pipefail | |||
BASH_DIR=$(cd $(dirname ${BASH_SOURCE}) && pwd) | |||
PROJECT_DIR=${BASH_DIR} | |||
IMAGE_NAME=tensorflow-examples:cpu | |||
cd ${PROJECT_DIR} | |||
docker build -t ${IMAGE_NAME} -f ${BASH_DIR}/Dockerfile . | |||
echo Succeeded to build docker image ${IMAGE_NAME} |
@@ -0,0 +1,35 @@ | |||
# MIT License | |||
# | |||
# Copyright (c) Microsoft Corporation. All rights reserved. | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in all | |||
# copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE | |||
# Add tensorflow/tensorflow | |||
FROM tensorflow/tensorflow:1.12.0-devel-gpu | |||
ENV PROJECT_DIR=/tensorflow | |||
WORKDIR ${PROJECT_DIR} | |||
# Add tensorflow/benchmarks | |||
RUN git clone --branch=cnn_tf_v1.12_compatible --depth=1 \ | |||
https://github.com/tensorflow/benchmarks.git | |||
RUN ls -l . | |||
ENTRYPOINT ["/bin/bash"] |
@@ -0,0 +1,37 @@ | |||
#!/bin/bash | |||
# MIT License | |||
# | |||
# Copyright (c) Microsoft Corporation. All rights reserved. | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in all | |||
# copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE | |||
set -o errexit | |||
set -o nounset | |||
set -o pipefail | |||
BASH_DIR=$(cd $(dirname ${BASH_SOURCE}) && pwd) | |||
PROJECT_DIR=${BASH_DIR} | |||
IMAGE_NAME=tensorflow-examples:gpu | |||
cd ${PROJECT_DIR} | |||
docker build -t ${IMAGE_NAME} -f ${BASH_DIR}/Dockerfile . | |||
echo Succeeded to build docker image ${IMAGE_NAME} |
@@ -0,0 +1,702 @@ | |||
// Copyright 2012 The Go Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style | |||
// license that can be found in the LICENSE file. | |||
package build // import "golang.org/x/text/collate/build" | |||
import ( | |||
"fmt" | |||
"io" | |||
"log" | |||
"sort" | |||
"strings" | |||
"unicode/utf8" | |||
"golang.org/x/text/internal/colltab" | |||
"golang.org/x/text/language" | |||
"golang.org/x/text/unicode/norm" | |||
) | |||
// TODO: optimizations: | |||
// - expandElem is currently 20K. By putting unique colElems in a separate | |||
// table and having a byte array of indexes into this table, we can reduce | |||
// the total size to about 7K. By also factoring out the length bytes, we | |||
// can reduce this to about 6K. | |||
// - trie valueBlocks are currently 100K. There are a lot of sparse blocks | |||
// and many consecutive values with the same stride. This can be further | |||
// compacted. | |||
// - Compress secondary weights into 8 bits. | |||
// - Some LDML specs specify a context element. Currently we simply concatenate | |||
// those. Context can be implemented using the contraction trie. If Builder | |||
// could analyze and detect when using a context makes sense, there is no | |||
// need to expose this construct in the API. | |||
// A Builder builds a root collation table. The user must specify the | |||
// collation elements for each entry. A common use will be to base the weights | |||
// on those specified in the allkeys* file as provided by the UCA or CLDR. | |||
type Builder struct { | |||
index *trieBuilder | |||
root ordering | |||
locale []*Tailoring | |||
t *table | |||
err error | |||
built bool | |||
minNonVar int // lowest primary recorded for a variable | |||
varTop int // highest primary recorded for a non-variable | |||
// indexes used for reusing expansions and contractions | |||
expIndex map[string]int // positions of expansions keyed by their string representation | |||
ctHandle map[string]ctHandle // contraction handles keyed by a concatenation of the suffixes | |||
ctElem map[string]int // contraction elements keyed by their string representation | |||
} | |||
// A Tailoring builds a collation table based on another collation table. | |||
// The table is defined by specifying tailorings to the underlying table. | |||
// See http://unicode.org/reports/tr35/ for an overview of tailoring | |||
// collation tables. The CLDR contains pre-defined tailorings for a variety | |||
// of languages (See http://www.unicode.org/Public/cldr/<version>/core.zip.) | |||
type Tailoring struct { | |||
id string | |||
builder *Builder | |||
index *ordering | |||
anchor *entry | |||
before bool | |||
} | |||
// NewBuilder returns a new Builder. | |||
func NewBuilder() *Builder { | |||
return &Builder{ | |||
index: newTrieBuilder(), | |||
root: makeRootOrdering(), | |||
expIndex: make(map[string]int), | |||
ctHandle: make(map[string]ctHandle), | |||
ctElem: make(map[string]int), | |||
} | |||
} | |||
// Tailoring returns a Tailoring for the given locale. One should | |||
// have completed all calls to Add before calling Tailoring. | |||
func (b *Builder) Tailoring(loc language.Tag) *Tailoring { | |||
t := &Tailoring{ | |||
id: loc.String(), | |||
builder: b, | |||
index: b.root.clone(), | |||
} | |||
t.index.id = t.id | |||
b.locale = append(b.locale, t) | |||
return t | |||
} | |||
// Add adds an entry to the collation element table, mapping | |||
// a slice of runes to a sequence of collation elements. | |||
// A collation element is specified as list of weights: []int{primary, secondary, ...}. | |||
// The entries are typically obtained from a collation element table | |||
// as defined in http://www.unicode.org/reports/tr10/#Data_Table_Format. | |||
// Note that the collation elements specified by colelems are only used | |||
// as a guide. The actual weights generated by Builder may differ. | |||
// The argument variables is a list of indices into colelems that should contain | |||
// a value for each colelem that is a variable. (See the reference above.) | |||
func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error { | |||
str := string(runes) | |||
elems := make([]rawCE, len(colelems)) | |||
for i, ce := range colelems { | |||
if len(ce) == 0 { | |||
break | |||
} | |||
elems[i] = makeRawCE(ce, 0) | |||
if len(ce) == 1 { | |||
elems[i].w[1] = defaultSecondary | |||
} | |||
if len(ce) <= 2 { | |||
elems[i].w[2] = defaultTertiary | |||
} | |||
if len(ce) <= 3 { | |||
elems[i].w[3] = ce[0] | |||
} | |||
} | |||
for i, ce := range elems { | |||
p := ce.w[0] | |||
isvar := false | |||
for _, j := range variables { | |||
if i == j { | |||
isvar = true | |||
} | |||
} | |||
if isvar { | |||
if p >= b.minNonVar && b.minNonVar > 0 { | |||
return fmt.Errorf("primary value %X of variable is larger than the smallest non-variable %X", p, b.minNonVar) | |||
} | |||
if p > b.varTop { | |||
b.varTop = p | |||
} | |||
} else if p > 1 { // 1 is a special primary value reserved for FFFE | |||
if p <= b.varTop { | |||
return fmt.Errorf("primary value %X of non-variable is smaller than the highest variable %X", p, b.varTop) | |||
} | |||
if b.minNonVar == 0 || p < b.minNonVar { | |||
b.minNonVar = p | |||
} | |||
} | |||
} | |||
elems, err := convertLargeWeights(elems) | |||
if err != nil { | |||
return err | |||
} | |||
cccs := []uint8{} | |||
nfd := norm.NFD.String(str) | |||
for i := range nfd { | |||
cccs = append(cccs, norm.NFD.PropertiesString(nfd[i:]).CCC()) | |||
} | |||
if len(cccs) < len(elems) { | |||
if len(cccs) > 2 { | |||
return fmt.Errorf("number of decomposed characters should be greater or equal to the number of collation elements for len(colelems) > 3 (%d < %d)", len(cccs), len(elems)) | |||
} | |||
p := len(elems) - 1 | |||
for ; p > 0 && elems[p].w[0] == 0; p-- { | |||
elems[p].ccc = cccs[len(cccs)-1] | |||
} | |||
for ; p >= 0; p-- { | |||
elems[p].ccc = cccs[0] | |||
} | |||
} else { | |||
for i := range elems { | |||
elems[i].ccc = cccs[i] | |||
} | |||
} | |||
// doNorm in collate.go assumes that the following conditions hold. | |||
if len(elems) > 1 && len(cccs) > 1 && cccs[0] != 0 && cccs[0] != cccs[len(cccs)-1] { | |||
return fmt.Errorf("incompatible CCC values for expansion %X (%d)", runes, cccs) | |||
} | |||
b.root.newEntry(str, elems) | |||
return nil | |||
} | |||
func (t *Tailoring) setAnchor(anchor string) error { | |||
anchor = norm.NFC.String(anchor) | |||
a := t.index.find(anchor) | |||
if a == nil { | |||
a = t.index.newEntry(anchor, nil) | |||
a.implicit = true | |||
a.modified = true | |||
for _, r := range []rune(anchor) { | |||
e := t.index.find(string(r)) | |||
e.lock = true | |||
} | |||
} | |||
t.anchor = a | |||
return nil | |||
} | |||
// SetAnchor sets the point after which elements passed in subsequent calls to | |||
// Insert will be inserted. It is equivalent to the reset directive in an LDML | |||
// specification. See Insert for an example. | |||
// SetAnchor supports the following logical reset positions: | |||
// <first_tertiary_ignorable/>, <last_teriary_ignorable/>, <first_primary_ignorable/>, | |||
// and <last_non_ignorable/>. | |||
func (t *Tailoring) SetAnchor(anchor string) error { | |||
if err := t.setAnchor(anchor); err != nil { | |||
return err | |||
} | |||
t.before = false | |||
return nil | |||
} | |||
// SetAnchorBefore is similar to SetAnchor, except that subsequent calls to | |||
// Insert will insert entries before the anchor. | |||
func (t *Tailoring) SetAnchorBefore(anchor string) error { | |||
if err := t.setAnchor(anchor); err != nil { | |||
return err | |||
} | |||
t.before = true | |||
return nil | |||
} | |||
// Insert sets the ordering of str relative to the entry set by the previous | |||
// call to SetAnchor or Insert. The argument extend corresponds | |||
// to the extend elements as defined in LDML. A non-empty value for extend | |||
// will cause the collation elements corresponding to extend to be appended | |||
// to the collation elements generated for the entry added by Insert. | |||
// This has the same net effect as sorting str after the string anchor+extend. | |||
// See http://www.unicode.org/reports/tr10/#Tailoring_Example for details | |||
// on parametric tailoring and http://unicode.org/reports/tr35/#Collation_Elements | |||
// for full details on LDML. | |||
// | |||
// Examples: create a tailoring for Swedish, where "ä" is ordered after "z" | |||
// at the primary sorting level: | |||
// t := b.Tailoring("se") | |||
// t.SetAnchor("z") | |||
// t.Insert(colltab.Primary, "ä", "") | |||
// Order "ü" after "ue" at the secondary sorting level: | |||
// t.SetAnchor("ue") | |||
// t.Insert(colltab.Secondary, "ü","") | |||
// or | |||
// t.SetAnchor("u") | |||
// t.Insert(colltab.Secondary, "ü", "e") | |||
// Order "q" afer "ab" at the secondary level and "Q" after "q" | |||
// at the tertiary level: | |||
// t.SetAnchor("ab") | |||
// t.Insert(colltab.Secondary, "q", "") | |||
// t.Insert(colltab.Tertiary, "Q", "") | |||
// Order "b" before "a": | |||
// t.SetAnchorBefore("a") | |||
// t.Insert(colltab.Primary, "b", "") | |||
// Order "0" after the last primary ignorable: | |||
// t.SetAnchor("<last_primary_ignorable/>") | |||
// t.Insert(colltab.Primary, "0", "") | |||
func (t *Tailoring) Insert(level colltab.Level, str, extend string) error { | |||
if t.anchor == nil { | |||
return fmt.Errorf("%s:Insert: no anchor point set for tailoring of %s", t.id, str) | |||
} | |||
str = norm.NFC.String(str) | |||
e := t.index.find(str) | |||
if e == nil { | |||
e = t.index.newEntry(str, nil) | |||
} else if e.logical != noAnchor { | |||
return fmt.Errorf("%s:Insert: cannot reinsert logical reset position %q", t.id, e.str) | |||
} | |||
if e.lock { | |||
return fmt.Errorf("%s:Insert: cannot reinsert element %q", t.id, e.str) | |||
} | |||
a := t.anchor | |||
// Find the first element after the anchor which differs at a level smaller or | |||
// equal to the given level. Then insert at this position. | |||
// See http://unicode.org/reports/tr35/#Collation_Elements, Section 5.14.5 for details. | |||
e.before = t.before | |||
if t.before { | |||
t.before = false | |||
if a.prev == nil { | |||
a.insertBefore(e) | |||
} else { | |||
for a = a.prev; a.level > level; a = a.prev { | |||
} | |||
a.insertAfter(e) | |||
} | |||
e.level = level | |||
} else { | |||
for ; a.level > level; a = a.next { | |||
} | |||
e.level = a.level | |||
if a != e { | |||
a.insertAfter(e) | |||
a.level = level | |||
} else { | |||
// We don't set a to prev itself. This has the effect of the entry | |||
// getting new collation elements that are an increment of itself. | |||
// This is intentional. | |||
a.prev.level = level | |||
} | |||
} | |||
e.extend = norm.NFD.String(extend) | |||
e.exclude = false | |||
e.modified = true | |||
e.elems = nil | |||
t.anchor = e | |||
return nil | |||
} | |||
func (o *ordering) getWeight(e *entry) []rawCE { | |||
if len(e.elems) == 0 && e.logical == noAnchor { | |||
if e.implicit { | |||
for _, r := range e.runes { | |||
e.elems = append(e.elems, o.getWeight(o.find(string(r)))...) | |||
} | |||
} else if e.before { | |||
count := [colltab.Identity + 1]int{} | |||
a := e | |||
for ; a.elems == nil && !a.implicit; a = a.next { | |||
count[a.level]++ | |||
} | |||
e.elems = []rawCE{makeRawCE(a.elems[0].w, a.elems[0].ccc)} | |||
for i := colltab.Primary; i < colltab.Quaternary; i++ { | |||
if count[i] != 0 { | |||
e.elems[0].w[i] -= count[i] | |||
break | |||
} | |||
} | |||
if e.prev != nil { | |||
o.verifyWeights(e.prev, e, e.prev.level) | |||
} | |||
} else { | |||
prev := e.prev | |||
e.elems = nextWeight(prev.level, o.getWeight(prev)) | |||
o.verifyWeights(e, e.next, e.level) | |||
} | |||
} | |||
return e.elems | |||
} | |||
func (o *ordering) addExtension(e *entry) { | |||
if ex := o.find(e.extend); ex != nil { | |||
e.elems = append(e.elems, ex.elems...) | |||
} else { | |||
for _, r := range []rune(e.extend) { | |||
e.elems = append(e.elems, o.find(string(r)).elems...) | |||
} | |||
} | |||
e.extend = "" | |||
} | |||
func (o *ordering) verifyWeights(a, b *entry, level colltab.Level) error { | |||
if level == colltab.Identity || b == nil || b.elems == nil || a.elems == nil { | |||
return nil | |||
} | |||
for i := colltab.Primary; i < level; i++ { | |||
if a.elems[0].w[i] < b.elems[0].w[i] { | |||
return nil | |||
} | |||
} | |||
if a.elems[0].w[level] >= b.elems[0].w[level] { | |||
err := fmt.Errorf("%s:overflow: collation elements of %q (%X) overflows those of %q (%X) at level %d (%X >= %X)", o.id, a.str, a.runes, b.str, b.runes, level, a.elems, b.elems) | |||
log.Println(err) | |||
// TODO: return the error instead, or better, fix the conflicting entry by making room. | |||
} | |||
return nil | |||
} | |||
func (b *Builder) error(e error) { | |||
if e != nil { | |||
b.err = e | |||
} | |||
} | |||
func (b *Builder) errorID(locale string, e error) { | |||
if e != nil { | |||
b.err = fmt.Errorf("%s:%v", locale, e) | |||
} | |||
} | |||
// patchNorm ensures that NFC and NFD counterparts are consistent. | |||
func (o *ordering) patchNorm() { | |||
// Insert the NFD counterparts, if necessary. | |||
for _, e := range o.ordered { | |||
nfd := norm.NFD.String(e.str) | |||
if nfd != e.str { | |||
if e0 := o.find(nfd); e0 != nil && !e0.modified { | |||
e0.elems = e.elems | |||
} else if e.modified && !equalCEArrays(o.genColElems(nfd), e.elems) { | |||
e := o.newEntry(nfd, e.elems) | |||
e.modified = true | |||
} | |||
} | |||
} | |||
// Update unchanged composed forms if one of their parts changed. | |||
for _, e := range o.ordered { | |||
nfd := norm.NFD.String(e.str) | |||
if e.modified || nfd == e.str { | |||
continue | |||
} | |||
if e0 := o.find(nfd); e0 != nil { | |||
e.elems = e0.elems | |||
} else { | |||
e.elems = o.genColElems(nfd) | |||
if norm.NFD.LastBoundary([]byte(nfd)) == 0 { | |||
r := []rune(nfd) | |||
head := string(r[0]) | |||
tail := "" | |||
for i := 1; i < len(r); i++ { | |||
s := norm.NFC.String(head + string(r[i])) | |||
if e0 := o.find(s); e0 != nil && e0.modified { | |||
head = s | |||
} else { | |||
tail += string(r[i]) | |||
} | |||
} | |||
e.elems = append(o.genColElems(head), o.genColElems(tail)...) | |||
} | |||
} | |||
} | |||
// Exclude entries for which the individual runes generate the same collation elements. | |||
for _, e := range o.ordered { | |||
if len(e.runes) > 1 && equalCEArrays(o.genColElems(e.str), e.elems) { | |||
e.exclude = true | |||
} | |||
} | |||
} | |||
func (b *Builder) buildOrdering(o *ordering) { | |||
for _, e := range o.ordered { | |||
o.getWeight(e) | |||
} | |||
for _, e := range o.ordered { | |||
o.addExtension(e) | |||
} | |||
o.patchNorm() | |||
o.sort() | |||
simplify(o) | |||
b.processExpansions(o) // requires simplify | |||
b.processContractions(o) // requires simplify | |||
t := newNode() | |||
for e := o.front(); e != nil; e, _ = e.nextIndexed() { | |||
if !e.skip() { | |||
ce, err := e.encode() | |||
b.errorID(o.id, err) | |||
t.insert(e.runes[0], ce) | |||
} | |||
} | |||
o.handle = b.index.addTrie(t) | |||
} | |||
func (b *Builder) build() (*table, error) { | |||
if b.built { | |||
return b.t, b.err | |||
} | |||
b.built = true | |||
b.t = &table{ | |||
Table: colltab.Table{ | |||
MaxContractLen: utf8.UTFMax, | |||
VariableTop: uint32(b.varTop), | |||
}, | |||
} | |||
b.buildOrdering(&b.root) | |||
b.t.root = b.root.handle | |||
for _, t := range b.locale { | |||
b.buildOrdering(t.index) | |||
if b.err != nil { | |||
break | |||
} | |||
} | |||
i, err := b.index.generate() | |||
b.t.trie = *i | |||
b.t.Index = colltab.Trie{ | |||
Index: i.index, | |||
Values: i.values, | |||
Index0: i.index[blockSize*b.t.root.lookupStart:], | |||
Values0: i.values[blockSize*b.t.root.valueStart:], | |||
} | |||
b.error(err) | |||
return b.t, b.err | |||
} | |||
// Build builds the root Collator. | |||
func (b *Builder) Build() (colltab.Weighter, error) { | |||
table, err := b.build() | |||
if err != nil { | |||
return nil, err | |||
} | |||
return table, nil | |||
} | |||
// Build builds a Collator for Tailoring t. | |||
func (t *Tailoring) Build() (colltab.Weighter, error) { | |||
// TODO: implement. | |||
return nil, nil | |||
} | |||
// Print prints the tables for b and all its Tailorings as a Go file | |||
// that can be included in the Collate package. | |||
func (b *Builder) Print(w io.Writer) (n int, err error) { | |||
p := func(nn int, e error) { | |||
n += nn | |||
if err == nil { | |||
err = e | |||
} | |||
} | |||
t, err := b.build() | |||
if err != nil { | |||
return 0, err | |||
} | |||
p(fmt.Fprintf(w, `var availableLocales = "und`)) | |||
for _, loc := range b.locale { | |||
if loc.id != "und" { | |||
p(fmt.Fprintf(w, ",%s", loc.id)) | |||
} | |||
} | |||
p(fmt.Fprint(w, "\"\n\n")) | |||
p(fmt.Fprintf(w, "const varTop = 0x%x\n\n", b.varTop)) | |||
p(fmt.Fprintln(w, "var locales = [...]tableIndex{")) | |||
for _, loc := range b.locale { | |||
if loc.id == "und" { | |||
p(t.fprintIndex(w, loc.index.handle, loc.id)) | |||
} | |||
} | |||
for _, loc := range b.locale { | |||
if loc.id != "und" { | |||
p(t.fprintIndex(w, loc.index.handle, loc.id)) | |||
} | |||
} | |||
p(fmt.Fprint(w, "}\n\n")) | |||
n, _, err = t.fprint(w, "main") | |||
return | |||
} | |||
// reproducibleFromNFKD checks whether the given expansion could be generated | |||
// from an NFKD expansion. | |||
func reproducibleFromNFKD(e *entry, exp, nfkd []rawCE) bool { | |||
// Length must be equal. | |||
if len(exp) != len(nfkd) { | |||
return false | |||
} | |||
for i, ce := range exp { | |||
// Primary and secondary values should be equal. | |||
if ce.w[0] != nfkd[i].w[0] || ce.w[1] != nfkd[i].w[1] { | |||
return false | |||
} | |||
// Tertiary values should be equal to maxTertiary for third element onwards. | |||
// TODO: there seem to be a lot of cases in CLDR (e.g. ㏭ in zh.xml) that can | |||
// simply be dropped. Try this out by dropping the following code. | |||
if i >= 2 && ce.w[2] != maxTertiary { | |||
return false | |||
} | |||
if _, err := makeCE(ce); err != nil { | |||
// Simply return false. The error will be caught elsewhere. | |||
return false | |||
} | |||
} | |||
return true | |||
} | |||
func simplify(o *ordering) { | |||
// Runes that are a starter of a contraction should not be removed. | |||
// (To date, there is only Kannada character 0CCA.) | |||
keep := make(map[rune]bool) | |||
for e := o.front(); e != nil; e, _ = e.nextIndexed() { | |||
if len(e.runes) > 1 { | |||
keep[e.runes[0]] = true | |||
} | |||
} | |||
// Tag entries for which the runes NFKD decompose to identical values. | |||
for e := o.front(); e != nil; e, _ = e.nextIndexed() { | |||
s := e.str | |||
nfkd := norm.NFKD.String(s) | |||
nfd := norm.NFD.String(s) | |||
if e.decompose || len(e.runes) > 1 || len(e.elems) == 1 || keep[e.runes[0]] || nfkd == nfd { | |||
continue | |||
} | |||
if reproducibleFromNFKD(e, e.elems, o.genColElems(nfkd)) { | |||
e.decompose = true | |||
} | |||
} | |||
} | |||
// appendExpansion converts the given collation sequence to | |||
// collation elements and adds them to the expansion table. | |||
// It returns an index to the expansion table. | |||
func (b *Builder) appendExpansion(e *entry) int { | |||
t := b.t | |||
i := len(t.ExpandElem) | |||
ce := uint32(len(e.elems)) | |||
t.ExpandElem = append(t.ExpandElem, ce) | |||
for _, w := range e.elems { | |||
ce, err := makeCE(w) | |||
if err != nil { | |||
b.error(err) | |||
return -1 | |||
} | |||
t.ExpandElem = append(t.ExpandElem, ce) | |||
} | |||
return i | |||
} | |||
// processExpansions extracts data necessary to generate | |||
// the extraction tables. | |||
func (b *Builder) processExpansions(o *ordering) { | |||
for e := o.front(); e != nil; e, _ = e.nextIndexed() { | |||
if !e.expansion() { | |||
continue | |||
} | |||
key := fmt.Sprintf("%v", e.elems) | |||
i, ok := b.expIndex[key] | |||
if !ok { | |||
i = b.appendExpansion(e) | |||
b.expIndex[key] = i | |||
} | |||
e.expansionIndex = i | |||
} | |||
} | |||
func (b *Builder) processContractions(o *ordering) { | |||
// Collate contractions per starter rune. | |||
starters := []rune{} | |||
cm := make(map[rune][]*entry) | |||
for e := o.front(); e != nil; e, _ = e.nextIndexed() { | |||
if e.contraction() { | |||
if len(e.str) > b.t.MaxContractLen { | |||
b.t.MaxContractLen = len(e.str) | |||
} | |||
r := e.runes[0] | |||
if _, ok := cm[r]; !ok { | |||
starters = append(starters, r) | |||
} | |||
cm[r] = append(cm[r], e) | |||
} | |||
} | |||
// Add entries of single runes that are at a start of a contraction. | |||
for e := o.front(); e != nil; e, _ = e.nextIndexed() { | |||
if !e.contraction() { | |||
r := e.runes[0] | |||
if _, ok := cm[r]; ok { | |||
cm[r] = append(cm[r], e) | |||
} | |||
} | |||
} | |||
// Build the tries for the contractions. | |||
t := b.t | |||
for _, r := range starters { | |||
l := cm[r] | |||
// Compute suffix strings. There are 31 different contraction suffix | |||
// sets for 715 contractions and 82 contraction starter runes as of | |||
// version 6.0.0. | |||
sufx := []string{} | |||
hasSingle := false | |||
for _, e := range l { | |||
if len(e.runes) > 1 { | |||
sufx = append(sufx, string(e.runes[1:])) | |||
} else { | |||
hasSingle = true | |||
} | |||
} | |||
if !hasSingle { | |||
b.error(fmt.Errorf("no single entry for starter rune %U found", r)) | |||
continue | |||
} | |||
// Unique the suffix set. | |||
sort.Strings(sufx) | |||
key := strings.Join(sufx, "\n") | |||
handle, ok := b.ctHandle[key] | |||
if !ok { | |||
var err error | |||
handle, err = appendTrie(&t.ContractTries, sufx) | |||
if err != nil { | |||
b.error(err) | |||
} | |||
b.ctHandle[key] = handle | |||
} | |||
// Bucket sort entries in index order. | |||
es := make([]*entry, len(l)) | |||
for _, e := range l { | |||
var p, sn int | |||
if len(e.runes) > 1 { | |||
str := []byte(string(e.runes[1:])) | |||
p, sn = lookup(&t.ContractTries, handle, str) | |||
if sn != len(str) { | |||
log.Fatalf("%s: processContractions: unexpected length for '%X'; len=%d; want %d", o.id, e.runes, sn, len(str)) | |||
} | |||
} | |||
if es[p] != nil { | |||
log.Fatalf("%s: multiple contractions for position %d for rune %U", o.id, p, e.runes[0]) | |||
} | |||
es[p] = e | |||
} | |||
// Create collation elements for contractions. | |||
elems := []uint32{} | |||
for _, e := range es { | |||
ce, err := e.encodeBase() | |||
b.errorID(o.id, err) | |||
elems = append(elems, ce) | |||
} | |||
key = fmt.Sprintf("%v", elems) | |||
i, ok := b.ctElem[key] | |||
if !ok { | |||
i = len(t.ContractElem) | |||
b.ctElem[key] = i | |||
t.ContractElem = append(t.ContractElem, elems...) | |||
} | |||
// Store info in entry for starter rune. | |||
es[0].contractionIndex = i | |||
es[0].contractionHandle = handle | |||
} | |||
} |
@@ -0,0 +1,294 @@ | |||
// Copyright 2012 The Go Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style | |||
// license that can be found in the LICENSE file. | |||
package build | |||
import ( | |||
"fmt" | |||
"unicode" | |||
"golang.org/x/text/internal/colltab" | |||
) | |||
const ( | |||
defaultSecondary = 0x20 | |||
defaultTertiary = 0x2 | |||
maxTertiary = 0x1F | |||
) | |||
type rawCE struct { | |||
w []int | |||
ccc uint8 | |||
} | |||
func makeRawCE(w []int, ccc uint8) rawCE { | |||
ce := rawCE{w: make([]int, 4), ccc: ccc} | |||
copy(ce.w, w) | |||
return ce | |||
} | |||
// A collation element is represented as an uint32. | |||
// In the typical case, a rune maps to a single collation element. If a rune | |||
// can be the start of a contraction or expands into multiple collation elements, | |||
// then the collation element that is associated with a rune will have a special | |||
// form to represent such m to n mappings. Such special collation elements | |||
// have a value >= 0x80000000. | |||
const ( | |||
maxPrimaryBits = 21 | |||
maxSecondaryBits = 12 | |||
maxTertiaryBits = 8 | |||
) | |||
func makeCE(ce rawCE) (uint32, error) { | |||
v, e := colltab.MakeElem(ce.w[0], ce.w[1], ce.w[2], ce.ccc) | |||
return uint32(v), e | |||
} | |||
// For contractions, collation elements are of the form | |||
// 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where | |||
// - n* is the size of the first node in the contraction trie. | |||
// - i* is the index of the first node in the contraction trie. | |||
// - b* is the offset into the contraction collation element table. | |||
// See contract.go for details on the contraction trie. | |||
const ( | |||
contractID = 0xC0000000 | |||
maxNBits = 4 | |||
maxTrieIndexBits = 12 | |||
maxContractOffsetBits = 13 | |||
) | |||
func makeContractIndex(h ctHandle, offset int) (uint32, error) { | |||
if h.n >= 1<<maxNBits { | |||
return 0, fmt.Errorf("size of contraction trie node too large: %d >= %d", h.n, 1<<maxNBits) | |||
} | |||
if h.index >= 1<<maxTrieIndexBits { | |||
return 0, fmt.Errorf("size of contraction trie offset too large: %d >= %d", h.index, 1<<maxTrieIndexBits) | |||
} | |||
if offset >= 1<<maxContractOffsetBits { | |||
return 0, fmt.Errorf("contraction offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits) | |||
} | |||
ce := uint32(contractID) | |||
ce += uint32(offset << (maxNBits + maxTrieIndexBits)) | |||
ce += uint32(h.index << maxNBits) | |||
ce += uint32(h.n) | |||
return ce, nil | |||
} | |||
// For expansions, collation elements are of the form | |||
// 11100000 00000000 bbbbbbbb bbbbbbbb, | |||
// where b* is the index into the expansion sequence table. | |||
const ( | |||
expandID = 0xE0000000 | |||
maxExpandIndexBits = 16 | |||
) | |||
func makeExpandIndex(index int) (uint32, error) { | |||
if index >= 1<<maxExpandIndexBits { | |||
return 0, fmt.Errorf("expansion index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits) | |||
} | |||
return expandID + uint32(index), nil | |||
} | |||
// Each list of collation elements corresponding to an expansion starts with | |||
// a header indicating the length of the sequence. | |||
func makeExpansionHeader(n int) (uint32, error) { | |||
return uint32(n), nil | |||
} | |||
// Some runes can be expanded using NFKD decomposition. Instead of storing the full | |||
// sequence of collation elements, we decompose the rune and lookup the collation | |||
// elements for each rune in the decomposition and modify the tertiary weights. | |||
// The collation element, in this case, is of the form | |||
// 11110000 00000000 wwwwwwww vvvvvvvv, where | |||
// - v* is the replacement tertiary weight for the first rune, | |||
// - w* is the replacement tertiary weight for the second rune, | |||
// Tertiary weights of subsequent runes should be replaced with maxTertiary. | |||
// See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details. | |||
const ( | |||
decompID = 0xF0000000 | |||
) | |||
func makeDecompose(t1, t2 int) (uint32, error) { | |||
if t1 >= 256 || t1 < 0 { | |||
return 0, fmt.Errorf("first tertiary weight out of bounds: %d >= 256", t1) | |||
} | |||
if t2 >= 256 || t2 < 0 { | |||
return 0, fmt.Errorf("second tertiary weight out of bounds: %d >= 256", t2) | |||
} | |||
return uint32(t2<<8+t1) + decompID, nil | |||
} | |||
const ( | |||
// These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf. | |||
minUnified rune = 0x4E00 | |||
maxUnified = 0x9FFF | |||
minCompatibility = 0xF900 | |||
maxCompatibility = 0xFAFF | |||
minRare = 0x3400 | |||
maxRare = 0x4DBF | |||
) | |||
const ( | |||
commonUnifiedOffset = 0x10000 | |||
rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF | |||
otherOffset = 0x50000 // largest rune in rare is U+2FA1D | |||
illegalOffset = otherOffset + int(unicode.MaxRune) | |||
maxPrimary = illegalOffset + 1 | |||
) | |||
// implicitPrimary returns the primary weight for the a rune | |||
// for which there is no entry for the rune in the collation table. | |||
// We take a different approach from the one specified in | |||
// http://unicode.org/reports/tr10/#Implicit_Weights, | |||
// but preserve the resulting relative ordering of the runes. | |||
func implicitPrimary(r rune) int { | |||
if unicode.Is(unicode.Ideographic, r) { | |||
if r >= minUnified && r <= maxUnified { | |||
// The most common case for CJK. | |||
return int(r) + commonUnifiedOffset | |||
} | |||
if r >= minCompatibility && r <= maxCompatibility { | |||
// This will typically not hit. The DUCET explicitly specifies mappings | |||
// for all characters that do not decompose. | |||
return int(r) + commonUnifiedOffset | |||
} | |||
return int(r) + rareUnifiedOffset | |||
} | |||
return int(r) + otherOffset | |||
} | |||
// convertLargeWeights converts collation elements with large | |||
// primaries (either double primaries or for illegal runes) | |||
// to our own representation. | |||
// A CJK character C is represented in the DUCET as | |||
// [.FBxx.0020.0002.C][.BBBB.0000.0000.C] | |||
// We will rewrite these characters to a single CE. | |||
// We assume the CJK values start at 0x8000. | |||
// See http://unicode.org/reports/tr10/#Implicit_Weights | |||
func convertLargeWeights(elems []rawCE) (res []rawCE, err error) { | |||
const ( | |||
cjkPrimaryStart = 0xFB40 | |||
rarePrimaryStart = 0xFB80 | |||
otherPrimaryStart = 0xFBC0 | |||
illegalPrimary = 0xFFFE | |||
highBitsMask = 0x3F | |||
lowBitsMask = 0x7FFF | |||
lowBitsFlag = 0x8000 | |||
shiftBits = 15 | |||
) | |||
for i := 0; i < len(elems); i++ { | |||
ce := elems[i].w | |||
p := ce[0] | |||
if p < cjkPrimaryStart { | |||
continue | |||
} | |||
if p > 0xFFFF { | |||
return elems, fmt.Errorf("found primary weight %X; should be <= 0xFFFF", p) | |||
} | |||
if p >= illegalPrimary { | |||
ce[0] = illegalOffset + p - illegalPrimary | |||
} else { | |||
if i+1 >= len(elems) { | |||
return elems, fmt.Errorf("second part of double primary weight missing: %v", elems) | |||
} | |||
if elems[i+1].w[0]&lowBitsFlag == 0 { | |||
return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems) | |||
} | |||
np := ((p & highBitsMask) << shiftBits) + elems[i+1].w[0]&lowBitsMask | |||
switch { | |||
case p < rarePrimaryStart: | |||
np += commonUnifiedOffset | |||
case p < otherPrimaryStart: | |||
np += rareUnifiedOffset | |||
default: | |||
p += otherOffset | |||
} | |||
ce[0] = np | |||
for j := i + 1; j+1 < len(elems); j++ { | |||
elems[j] = elems[j+1] | |||
} | |||
elems = elems[:len(elems)-1] | |||
} | |||
} | |||
return elems, nil | |||
} | |||
// nextWeight computes the first possible collation weights following elems | |||
// for the given level. | |||
func nextWeight(level colltab.Level, elems []rawCE) []rawCE { | |||
if level == colltab.Identity { | |||
next := make([]rawCE, len(elems)) | |||
copy(next, elems) | |||
return next | |||
} | |||
next := []rawCE{makeRawCE(elems[0].w, elems[0].ccc)} | |||
next[0].w[level]++ | |||
if level < colltab.Secondary { | |||
next[0].w[colltab.Secondary] = defaultSecondary | |||
} | |||
if level < colltab.Tertiary { | |||
next[0].w[colltab.Tertiary] = defaultTertiary | |||
} | |||
// Filter entries that cannot influence ordering. | |||
for _, ce := range elems[1:] { | |||
skip := true | |||
for i := colltab.Primary; i < level; i++ { | |||
skip = skip && ce.w[i] == 0 | |||
} | |||
if !skip { | |||
next = append(next, ce) | |||
} | |||
} | |||
return next | |||
} | |||
func nextVal(elems []rawCE, i int, level colltab.Level) (index, value int) { | |||
for ; i < len(elems) && elems[i].w[level] == 0; i++ { | |||
} | |||
if i < len(elems) { | |||
return i, elems[i].w[level] | |||
} | |||
return i, 0 | |||
} | |||
// compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise. | |||
// It also returns the collation level at which the difference is found. | |||
func compareWeights(a, b []rawCE) (result int, level colltab.Level) { | |||
for level := colltab.Primary; level < colltab.Identity; level++ { | |||
var va, vb int | |||
for ia, ib := 0, 0; ia < len(a) || ib < len(b); ia, ib = ia+1, ib+1 { | |||
ia, va = nextVal(a, ia, level) | |||
ib, vb = nextVal(b, ib, level) | |||
if va != vb { | |||
if va < vb { | |||
return -1, level | |||
} else { | |||
return 1, level | |||
} | |||
} | |||
} | |||
} | |||
return 0, colltab.Identity | |||
} | |||
func equalCE(a, b rawCE) bool { | |||
for i := 0; i < 3; i++ { | |||
if b.w[i] != a.w[i] { | |||
return false | |||
} | |||
} | |||
return true | |||
} | |||
func equalCEArrays(a, b []rawCE) bool { | |||
if len(a) != len(b) { | |||
return false | |||
} | |||
for i := range a { | |||
if !equalCE(a[i], b[i]) { | |||
return false | |||
} | |||
} | |||
return true | |||
} |
@@ -0,0 +1,309 @@ | |||
// Copyright 2012 The Go Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style | |||
// license that can be found in the LICENSE file. | |||
package build | |||
import ( | |||
"fmt" | |||
"io" | |||
"reflect" | |||
"sort" | |||
"strings" | |||
"golang.org/x/text/internal/colltab" | |||
) | |||
// This file contains code for detecting contractions and generating | |||
// the necessary tables. | |||
// Any Unicode Collation Algorithm (UCA) table entry that has more than | |||
// one rune one the left-hand side is called a contraction. | |||
// See http://www.unicode.org/reports/tr10/#Contractions for more details. | |||
// | |||
// We define the following terms: | |||
// initial: a rune that appears as the first rune in a contraction. | |||
// suffix: a sequence of runes succeeding the initial rune | |||
// in a given contraction. | |||
// non-initial: a rune that appears in a suffix. | |||
// | |||
// A rune may be both an initial and a non-initial and may be so in | |||
// many contractions. An initial may typically also appear by itself. | |||
// In case of ambiguities, the UCA requires we match the longest | |||
// contraction. | |||
// | |||
// Many contraction rules share the same set of possible suffixes. | |||
// We store sets of suffixes in a trie that associates an index with | |||
// each suffix in the set. This index can be used to look up a | |||
// collation element associated with the (starter rune, suffix) pair. | |||
// | |||
// The trie is defined on a UTF-8 byte sequence. | |||
// The overall trie is represented as an array of ctEntries. Each node of the trie | |||
// is represented as a subsequence of ctEntries, where each entry corresponds to | |||
// a possible match of a next character in the search string. An entry | |||
// also includes the length and offset to the next sequence of entries | |||
// to check in case of a match. | |||
const ( | |||
final = 0 | |||
noIndex = 0xFF | |||
) | |||
// ctEntry associates to a matching byte an offset and/or next sequence of | |||
// bytes to check. A ctEntry c is called final if a match means that the | |||
// longest suffix has been found. An entry c is final if c.N == 0. | |||
// A single final entry can match a range of characters to an offset. | |||
// A non-final entry always matches a single byte. Note that a non-final | |||
// entry might still resemble a completed suffix. | |||
// Examples: | |||
// The suffix strings "ab" and "ac" can be represented as: | |||
// []ctEntry{ | |||
// {'a', 1, 1, noIndex}, // 'a' by itself does not match, so i is 0xFF. | |||
// {'b', 'c', 0, 1}, // "ab" -> 1, "ac" -> 2 | |||
// } | |||
// | |||
// The suffix strings "ab", "abc", "abd", and "abcd" can be represented as: | |||
// []ctEntry{ | |||
// {'a', 1, 1, noIndex}, // 'a' must be followed by 'b'. | |||
// {'b', 1, 2, 1}, // "ab" -> 1, may be followed by 'c' or 'd'. | |||
// {'d', 'd', final, 3}, // "abd" -> 3 | |||
// {'c', 4, 1, 2}, // "abc" -> 2, may be followed by 'd'. | |||
// {'d', 'd', final, 4}, // "abcd" -> 4 | |||
// } | |||
// See genStateTests in contract_test.go for more examples. | |||
type ctEntry struct { | |||
L uint8 // non-final: byte value to match; final: lowest match in range. | |||
H uint8 // non-final: relative index to next block; final: highest match in range. | |||
N uint8 // non-final: length of next block; final: final | |||
I uint8 // result offset. Will be noIndex if more bytes are needed to complete. | |||
} | |||
// contractTrieSet holds a set of contraction tries. The tries are stored | |||
// consecutively in the entry field. | |||
type contractTrieSet []struct{ l, h, n, i uint8 } | |||
// ctHandle is used to identify a trie in the trie set, consisting in an offset | |||
// in the array and the size of the first node. | |||
type ctHandle struct { | |||
index, n int | |||
} | |||
// appendTrie adds a new trie for the given suffixes to the trie set and returns | |||
// a handle to it. The handle will be invalid on error. | |||
func appendTrie(ct *colltab.ContractTrieSet, suffixes []string) (ctHandle, error) { | |||
es := make([]stridx, len(suffixes)) | |||
for i, s := range suffixes { | |||
es[i].str = s | |||
} | |||
sort.Sort(offsetSort(es)) | |||
for i := range es { | |||
es[i].index = i + 1 | |||
} | |||
sort.Sort(genidxSort(es)) | |||
i := len(*ct) | |||
n, err := genStates(ct, es) | |||
if err != nil { | |||
*ct = (*ct)[:i] | |||
return ctHandle{}, err | |||
} | |||
return ctHandle{i, n}, nil | |||
} | |||
// genStates generates ctEntries for a given suffix set and returns | |||
// the number of entries for the first node. | |||
func genStates(ct *colltab.ContractTrieSet, sis []stridx) (int, error) { | |||
if len(sis) == 0 { | |||
return 0, fmt.Errorf("genStates: list of suffices must be non-empty") | |||
} | |||
start := len(*ct) | |||
// create entries for differing first bytes. | |||
for _, si := range sis { | |||
s := si.str | |||
if len(s) == 0 { | |||
continue | |||
} | |||
added := false | |||
c := s[0] | |||
if len(s) > 1 { | |||
for j := len(*ct) - 1; j >= start; j-- { | |||
if (*ct)[j].L == c { | |||
added = true | |||
break | |||
} | |||
} | |||
if !added { | |||
*ct = append(*ct, ctEntry{L: c, I: noIndex}) | |||
} | |||
} else { | |||
for j := len(*ct) - 1; j >= start; j-- { | |||
// Update the offset for longer suffixes with the same byte. | |||
if (*ct)[j].L == c { | |||
(*ct)[j].I = uint8(si.index) | |||
added = true | |||
} | |||
// Extend range of final ctEntry, if possible. | |||
if (*ct)[j].H+1 == c { | |||
(*ct)[j].H = c | |||
added = true | |||
} | |||
} | |||
if !added { | |||
*ct = append(*ct, ctEntry{L: c, H: c, N: final, I: uint8(si.index)}) | |||
} | |||
} | |||
} | |||
n := len(*ct) - start | |||
// Append nodes for the remainder of the suffixes for each ctEntry. | |||
sp := 0 | |||
for i, end := start, len(*ct); i < end; i++ { | |||
fe := (*ct)[i] | |||
if fe.H == 0 { // uninitialized non-final | |||
ln := len(*ct) - start - n | |||
if ln > 0xFF { | |||
return 0, fmt.Errorf("genStates: relative block offset too large: %d > 255", ln) | |||
} | |||
fe.H = uint8(ln) | |||
// Find first non-final strings with same byte as current entry. | |||
for ; sis[sp].str[0] != fe.L; sp++ { | |||
} | |||
se := sp + 1 | |||
for ; se < len(sis) && len(sis[se].str) > 1 && sis[se].str[0] == fe.L; se++ { | |||
} | |||
sl := sis[sp:se] | |||
sp = se | |||
for i, si := range sl { | |||
sl[i].str = si.str[1:] | |||
} | |||
nn, err := genStates(ct, sl) | |||
if err != nil { | |||
return 0, err | |||
} | |||
fe.N = uint8(nn) | |||
(*ct)[i] = fe | |||
} | |||
} | |||
sort.Sort(entrySort((*ct)[start : start+n])) | |||
return n, nil | |||
} | |||
// There may be both a final and non-final entry for a byte if the byte | |||
// is implied in a range of matches in the final entry. | |||
// We need to ensure that the non-final entry comes first in that case. | |||
type entrySort colltab.ContractTrieSet | |||
func (fe entrySort) Len() int { return len(fe) } | |||
func (fe entrySort) Swap(i, j int) { fe[i], fe[j] = fe[j], fe[i] } | |||
func (fe entrySort) Less(i, j int) bool { | |||
return fe[i].L > fe[j].L | |||
} | |||
// stridx is used for sorting suffixes and their associated offsets. | |||
type stridx struct { | |||
str string | |||
index int | |||
} | |||
// For computing the offsets, we first sort by size, and then by string. | |||
// This ensures that strings that only differ in the last byte by 1 | |||
// are sorted consecutively in increasing order such that they can | |||
// be packed as a range in a final ctEntry. | |||
type offsetSort []stridx | |||
func (si offsetSort) Len() int { return len(si) } | |||
func (si offsetSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] } | |||
func (si offsetSort) Less(i, j int) bool { | |||
if len(si[i].str) != len(si[j].str) { | |||
return len(si[i].str) > len(si[j].str) | |||
} | |||
return si[i].str < si[j].str | |||
} | |||
// For indexing, we want to ensure that strings are sorted in string order, where | |||
// for strings with the same prefix, we put longer strings before shorter ones. | |||
type genidxSort []stridx | |||
func (si genidxSort) Len() int { return len(si) } | |||
func (si genidxSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] } | |||
func (si genidxSort) Less(i, j int) bool { | |||
if strings.HasPrefix(si[j].str, si[i].str) { | |||
return false | |||
} | |||
if strings.HasPrefix(si[i].str, si[j].str) { | |||
return true | |||
} | |||
return si[i].str < si[j].str | |||
} | |||
// lookup matches the longest suffix in str and returns the associated offset | |||
// and the number of bytes consumed. | |||
func lookup(ct *colltab.ContractTrieSet, h ctHandle, str []byte) (index, ns int) { | |||
states := (*ct)[h.index:] | |||
p := 0 | |||
n := h.n | |||
for i := 0; i < n && p < len(str); { | |||
e := states[i] | |||
c := str[p] | |||
if c >= e.L { | |||
if e.L == c { | |||
p++ | |||
if e.I != noIndex { | |||
index, ns = int(e.I), p | |||
} | |||
if e.N != final { | |||
// set to new state | |||
i, states, n = 0, states[int(e.H)+n:], int(e.N) | |||
} else { | |||
return | |||
} | |||
continue | |||
} else if e.N == final && c <= e.H { | |||
p++ | |||
return int(c-e.L) + int(e.I), p | |||
} | |||
} | |||
i++ | |||
} | |||
return | |||
} | |||
// print writes the contractTrieSet t as compilable Go code to w. It returns | |||
// the total number of bytes written and the size of the resulting data structure in bytes. | |||
func print(t *colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) { | |||
update3 := func(nn, sz int, e error) { | |||
n += nn | |||
if err == nil { | |||
err = e | |||
} | |||
size += sz | |||
} | |||
update2 := func(nn int, e error) { update3(nn, 0, e) } | |||
update3(printArray(*t, w, name)) | |||
update2(fmt.Fprintf(w, "var %sContractTrieSet = ", name)) | |||
update3(printStruct(*t, w, name)) | |||
update2(fmt.Fprintln(w)) | |||
return | |||
} | |||
func printArray(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) { | |||
p := func(f string, a ...interface{}) { | |||
nn, e := fmt.Fprintf(w, f, a...) | |||
n += nn | |||
if err == nil { | |||
err = e | |||
} | |||
} | |||
size = len(ct) * 4 | |||
p("// %sCTEntries: %d entries, %d bytes\n", name, len(ct), size) | |||
p("var %sCTEntries = [%d]struct{L,H,N,I uint8}{\n", name, len(ct)) | |||
for _, fe := range ct { | |||
p("\t{0x%X, 0x%X, %d, %d},\n", fe.L, fe.H, fe.N, fe.I) | |||
} | |||
p("}\n") | |||
return | |||
} | |||
func printStruct(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) { | |||
n, err = fmt.Fprintf(w, "colltab.ContractTrieSet( %sCTEntries[:] )", name) | |||
size = int(reflect.TypeOf(ct).Size()) | |||
return | |||
} |
@@ -0,0 +1,393 @@ | |||
// Copyright 2012 The Go Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style | |||
// license that can be found in the LICENSE file. | |||
package build | |||
import ( | |||
"fmt" | |||
"log" | |||
"sort" | |||
"strings" | |||
"unicode" | |||
"golang.org/x/text/internal/colltab" | |||
"golang.org/x/text/unicode/norm" | |||
) | |||
type logicalAnchor int | |||
const ( | |||
firstAnchor logicalAnchor = -1 | |||
noAnchor = 0 | |||
lastAnchor = 1 | |||
) | |||
// entry is used to keep track of a single entry in the collation element table | |||
// during building. Examples of entries can be found in the Default Unicode | |||
// Collation Element Table. | |||
// See http://www.unicode.org/Public/UCA/6.0.0/allkeys.txt. | |||
type entry struct { | |||
str string // same as string(runes) | |||
runes []rune | |||
elems []rawCE // the collation elements | |||
extend string // weights of extend to be appended to elems | |||
before bool // weights relative to next instead of previous. | |||
lock bool // entry is used in extension and can no longer be moved. | |||
// prev, next, and level are used to keep track of tailorings. | |||
prev, next *entry | |||
level colltab.Level // next differs at this level | |||
skipRemove bool // do not unlink when removed | |||
decompose bool // can use NFKD decomposition to generate elems | |||
exclude bool // do not include in table | |||
implicit bool // derived, is not included in the list | |||
modified bool // entry was modified in tailoring | |||
logical logicalAnchor | |||
expansionIndex int // used to store index into expansion table | |||
contractionHandle ctHandle | |||
contractionIndex int // index into contraction elements | |||
} | |||
func (e *entry) String() string { | |||
return fmt.Sprintf("%X (%q) -> %X (ch:%x; ci:%d, ei:%d)", | |||
e.runes, e.str, e.elems, e.contractionHandle, e.contractionIndex, e.expansionIndex) | |||
} | |||
func (e *entry) skip() bool { | |||
return e.contraction() | |||
} | |||
func (e *entry) expansion() bool { | |||
return !e.decompose && len(e.elems) > 1 | |||
} | |||
func (e *entry) contraction() bool { | |||
return len(e.runes) > 1 | |||
} | |||
func (e *entry) contractionStarter() bool { | |||
return e.contractionHandle.n != 0 | |||
} | |||
// nextIndexed gets the next entry that needs to be stored in the table. | |||
// It returns the entry and the collation level at which the next entry differs | |||
// from the current entry. | |||
// Entries that can be explicitly derived and logical reset positions are | |||
// examples of entries that will not be indexed. | |||
func (e *entry) nextIndexed() (*entry, colltab.Level) { | |||
level := e.level | |||
for e = e.next; e != nil && (e.exclude || len(e.elems) == 0); e = e.next { | |||
if e.level < level { | |||
level = e.level | |||
} | |||
} | |||
return e, level | |||
} | |||
// remove unlinks entry e from the sorted chain and clears the collation | |||
// elements. e may not be at the front or end of the list. This should always | |||
// be the case, as the front and end of the list are always logical anchors, | |||
// which may not be removed. | |||
func (e *entry) remove() { | |||
if e.logical != noAnchor { | |||
log.Fatalf("may not remove anchor %q", e.str) | |||
} | |||
// TODO: need to set e.prev.level to e.level if e.level is smaller? | |||
e.elems = nil | |||
if !e.skipRemove { | |||
if e.prev != nil { | |||
e.prev.next = e.next | |||
} | |||
if e.next != nil { | |||
e.next.prev = e.prev | |||
} | |||
} | |||
e.skipRemove = false | |||
} | |||
// insertAfter inserts n after e. | |||
func (e *entry) insertAfter(n *entry) { | |||
if e == n { | |||
panic("e == anchor") | |||
} | |||
if e == nil { | |||
panic("unexpected nil anchor") | |||
} | |||
n.remove() | |||
n.decompose = false // redo decomposition test | |||
n.next = e.next | |||
n.prev = e | |||
if e.next != nil { | |||
e.next.prev = n | |||
} | |||
e.next = n | |||
} | |||
// insertBefore inserts n before e. | |||
func (e *entry) insertBefore(n *entry) { | |||
if e == n { | |||
panic("e == anchor") | |||
} | |||
if e == nil { | |||
panic("unexpected nil anchor") | |||
} | |||
n.remove() | |||
n.decompose = false // redo decomposition test | |||
n.prev = e.prev | |||
n.next = e | |||
if e.prev != nil { | |||
e.prev.next = n | |||
} | |||
e.prev = n | |||
} | |||
func (e *entry) encodeBase() (ce uint32, err error) { | |||
switch { | |||
case e.expansion(): | |||
ce, err = makeExpandIndex(e.expansionIndex) | |||
default: | |||
if e.decompose { | |||
log.Fatal("decompose should be handled elsewhere") | |||
} | |||
ce, err = makeCE(e.elems[0]) | |||
} | |||
return | |||
} | |||
func (e *entry) encode() (ce uint32, err error) { | |||
if e.skip() { | |||
log.Fatal("cannot build colElem for entry that should be skipped") | |||
} | |||
switch { | |||
case e.decompose: | |||
t1 := e.elems[0].w[2] | |||
t2 := 0 | |||
if len(e.elems) > 1 { | |||
t2 = e.elems[1].w[2] | |||
} | |||
ce, err = makeDecompose(t1, t2) | |||
case e.contractionStarter(): | |||
ce, err = makeContractIndex(e.contractionHandle, e.contractionIndex) | |||
default: | |||
if len(e.runes) > 1 { | |||
log.Fatal("colElem: contractions are handled in contraction trie") | |||
} | |||
ce, err = e.encodeBase() | |||
} | |||
return | |||
} | |||
// entryLess returns true if a sorts before b and false otherwise. | |||
func entryLess(a, b *entry) bool { | |||
if res, _ := compareWeights(a.elems, b.elems); res != 0 { | |||
return res == -1 | |||
} | |||
if a.logical != noAnchor { | |||
return a.logical == firstAnchor | |||
} | |||
if b.logical != noAnchor { | |||
return b.logical == lastAnchor | |||
} | |||
return a.str < b.str | |||
} | |||
type sortedEntries []*entry | |||
func (s sortedEntries) Len() int { | |||
return len(s) | |||
} | |||
func (s sortedEntries) Swap(i, j int) { | |||
s[i], s[j] = s[j], s[i] | |||
} | |||
func (s sortedEntries) Less(i, j int) bool { | |||
return entryLess(s[i], s[j]) | |||
} | |||
type ordering struct { | |||
id string | |||
entryMap map[string]*entry | |||
ordered []*entry | |||
handle *trieHandle | |||
} | |||
// insert inserts e into both entryMap and ordered. | |||
// Note that insert simply appends e to ordered. To reattain a sorted | |||
// order, o.sort() should be called. | |||
func (o *ordering) insert(e *entry) { | |||
if e.logical == noAnchor { | |||
o.entryMap[e.str] = e | |||
} else { | |||
// Use key format as used in UCA rules. | |||
o.entryMap[fmt.Sprintf("[%s]", e.str)] = e | |||
// Also add index entry for XML format. | |||
o.entryMap[fmt.Sprintf("<%s/>", strings.Replace(e.str, " ", "_", -1))] = e | |||
} | |||
o.ordered = append(o.ordered, e) | |||
} | |||
// newEntry creates a new entry for the given info and inserts it into | |||
// the index. | |||
func (o *ordering) newEntry(s string, ces []rawCE) *entry { | |||
e := &entry{ | |||
runes: []rune(s), | |||
elems: ces, | |||
str: s, | |||
} | |||
o.insert(e) | |||
return e | |||
} | |||
// find looks up and returns the entry for the given string. | |||
// It returns nil if str is not in the index and if an implicit value | |||
// cannot be derived, that is, if str represents more than one rune. | |||
func (o *ordering) find(str string) *entry { | |||
e := o.entryMap[str] | |||
if e == nil { | |||
r := []rune(str) | |||
if len(r) == 1 { | |||
const ( | |||
firstHangul = 0xAC00 | |||
lastHangul = 0xD7A3 | |||
) | |||
if r[0] >= firstHangul && r[0] <= lastHangul { | |||
ce := []rawCE{} | |||
nfd := norm.NFD.String(str) | |||
for _, r := range nfd { | |||
ce = append(ce, o.find(string(r)).elems...) | |||
} | |||
e = o.newEntry(nfd, ce) | |||
} else { | |||
e = o.newEntry(string(r[0]), []rawCE{ | |||
{w: []int{ | |||
implicitPrimary(r[0]), | |||
defaultSecondary, | |||
defaultTertiary, | |||
int(r[0]), | |||
}, | |||
}, | |||
}) | |||
e.modified = true | |||
} | |||
e.exclude = true // do not index implicits | |||
} | |||
} | |||
return e | |||
} | |||
// makeRootOrdering returns a newly initialized ordering value and populates | |||
// it with a set of logical reset points that can be used as anchors. | |||
// The anchors first_tertiary_ignorable and __END__ will always sort at | |||
// the beginning and end, respectively. This means that prev and next are non-nil | |||
// for any indexed entry. | |||
func makeRootOrdering() ordering { | |||
const max = unicode.MaxRune | |||
o := ordering{ | |||
entryMap: make(map[string]*entry), | |||
} | |||
insert := func(typ logicalAnchor, s string, ce []int) { | |||
e := &entry{ | |||
elems: []rawCE{{w: ce}}, | |||
str: s, | |||
exclude: true, | |||
logical: typ, | |||
} | |||
o.insert(e) | |||
} | |||
insert(firstAnchor, "first tertiary ignorable", []int{0, 0, 0, 0}) | |||
insert(lastAnchor, "last tertiary ignorable", []int{0, 0, 0, max}) | |||
insert(lastAnchor, "last primary ignorable", []int{0, defaultSecondary, defaultTertiary, max}) | |||
insert(lastAnchor, "last non ignorable", []int{maxPrimary, defaultSecondary, defaultTertiary, max}) | |||
insert(lastAnchor, "__END__", []int{1 << maxPrimaryBits, defaultSecondary, defaultTertiary, max}) | |||
return o | |||
} | |||
// patchForInsert eleminates entries from the list with more than one collation element. | |||
// The next and prev fields of the eliminated entries still point to appropriate | |||
// values in the newly created list. | |||
// It requires that sort has been called. | |||
func (o *ordering) patchForInsert() { | |||
for i := 0; i < len(o.ordered)-1; { | |||
e := o.ordered[i] | |||
lev := e.level | |||
n := e.next | |||
for ; n != nil && len(n.elems) > 1; n = n.next { | |||
if n.level < lev { | |||
lev = n.level | |||
} | |||
n.skipRemove = true | |||
} | |||
for ; o.ordered[i] != n; i++ { | |||
o.ordered[i].level = lev | |||
o.ordered[i].next = n | |||
o.ordered[i+1].prev = e | |||
} | |||
} | |||
} | |||
// clone copies all ordering of es into a new ordering value. | |||
func (o *ordering) clone() *ordering { | |||
o.sort() | |||
oo := ordering{ | |||
entryMap: make(map[string]*entry), | |||
} | |||
for _, e := range o.ordered { | |||
ne := &entry{ | |||
runes: e.runes, | |||
elems: e.elems, | |||
str: e.str, | |||
decompose: e.decompose, | |||
exclude: e.exclude, | |||
logical: e.logical, | |||
} | |||
oo.insert(ne) | |||
} | |||
oo.sort() // link all ordering. | |||
oo.patchForInsert() | |||
return &oo | |||
} | |||
// front returns the first entry to be indexed. | |||
// It assumes that sort() has been called. | |||
func (o *ordering) front() *entry { | |||
e := o.ordered[0] | |||
if e.prev != nil { | |||
log.Panicf("unexpected first entry: %v", e) | |||
} | |||
// The first entry is always a logical position, which should not be indexed. | |||
e, _ = e.nextIndexed() | |||
return e | |||
} | |||
// sort sorts all ordering based on their collation elements and initializes | |||
// the prev, next, and level fields accordingly. | |||
func (o *ordering) sort() { | |||
sort.Sort(sortedEntries(o.ordered)) | |||
l := o.ordered | |||
for i := 1; i < len(l); i++ { | |||
k := i - 1 | |||
l[k].next = l[i] | |||
_, l[k].level = compareWeights(l[k].elems, l[i].elems) | |||
l[i].prev = l[k] | |||
} | |||
} | |||
// genColElems generates a collation element array from the runes in str. This | |||
// assumes that all collation elements have already been added to the Builder. | |||
func (o *ordering) genColElems(str string) []rawCE { | |||
elems := []rawCE{} | |||
for _, r := range []rune(str) { | |||
for _, ce := range o.find(string(r)).elems { | |||
if ce.w[0] != 0 || ce.w[1] != 0 || ce.w[2] != 0 { | |||
elems = append(elems, ce) | |||
} | |||
} | |||
} | |||
return elems | |||
} |