From 87f84090686d269243d91aafc9efe9695f4a3d90 Mon Sep 17 00:00:00 2001 From: Iwasaki Yudai Date: Sun, 16 Aug 2015 23:46:35 -0700 Subject: [PATCH] Replace UTF8 bounds check with utf8reader --- Godeps/Godeps.json | 4 + .../src/github.com/yudai/utf8reader/README.md | 65 +++++++++++++++ .../github.com/yudai/utf8reader/utf8reader.go | 56 +++++++++++++ .../yudai/utf8reader/utf8reader_test.go | 79 +++++++++++++++++++ app/app.go | 34 ++------ 5 files changed, 209 insertions(+), 29 deletions(-) create mode 100644 Godeps/_workspace/src/github.com/yudai/utf8reader/README.md create mode 100644 Godeps/_workspace/src/github.com/yudai/utf8reader/utf8reader.go create mode 100644 Godeps/_workspace/src/github.com/yudai/utf8reader/utf8reader_test.go diff --git a/Godeps/Godeps.json b/Godeps/Godeps.json index 0772966..00e3f13 100644 --- a/Godeps/Godeps.json +++ b/Godeps/Godeps.json @@ -19,6 +19,10 @@ "ImportPath": "github.com/kr/pty", "Comment": "release.r56-28-g5cf931e", "Rev": "5cf931ef8f76dccd0910001d74a58a7fca84a83d" + }, + { + "ImportPath": "github.com/yudai/utf8reader", + "Rev": "543610cf49fc1279921d6ef1b7f7773ebece14f8" } ] } diff --git a/Godeps/_workspace/src/github.com/yudai/utf8reader/README.md b/Godeps/_workspace/src/github.com/yudai/utf8reader/README.md new file mode 100644 index 0000000..64e419a --- /dev/null +++ b/Godeps/_workspace/src/github.com/yudai/utf8reader/README.md @@ -0,0 +1,65 @@ +# UTF8Reader for Go + +UTF8Reader is a simple wrapper Reader that fills the given buffer with a "tail-safe" UTF8 byte sequence. + +## Tail-Safe? + +Let's say you have a buffer of 7 bytes and your Reader is going to fill your buffer with a UTF8 byte sequence. + +```go +buf := make([]byte, 7) +reader := strings.NewReader("いろは") + +reader.Read(buf) +``` + +The byte length of UTF8 characters is not fixed and some characters like the examples above have 3 byte length. There are others which have a single byte, 2 byte and 4 byte length as well. 
This means your buffer will sometimes be filled with the incomplete bytes of a Unicode character at the tail. + +After `reader.Read(buf)`, your `buf` will look like this: + +```go +[]byte{ + // い + byte(0xe3), // 1 + byte(0x81), // 2 + byte(0x84), // 3 + // ろ + byte(0xe3), // 4 + byte(0x82), // 5 + byte(0x8d), // 6 + // は (incomplete) + byte(0xe3), // 7 +} +``` + +The last character `は` is incomplete and the buffer is now invalid as a UTF8 string. + +UTF8Reader detects incomplete bytes like those above and stops filling the buffer in such cases. + +```go +buf := make([]byte, 7) +reader := strings.NewReader("いろは") +utfReader := utf8reader.New(reader) + +utfReader.Read(buf) +``` + +Then you will get: + +```go +[]byte{ + // い + byte(0xe3), // 1 + byte(0x81), // 2 + byte(0x84), // 3 + // ろ + byte(0xe3), // 4 + byte(0x82), // 5 + byte(0x8d), // 6 +} +``` +Of course, the bytes left behind will be used to fill the buffer on the next `Read()`. + +## Note + +UTF8Reader only checks for incomplete bytes at the tail of the buffer. Even if the original byte sequence given to UTF8Reader is broken, UTF8Reader reports no errors and just fills up the buffer. 
diff --git a/Godeps/_workspace/src/github.com/yudai/utf8reader/utf8reader.go b/Godeps/_workspace/src/github.com/yudai/utf8reader/utf8reader.go new file mode 100644 index 0000000..5d2057e --- /dev/null +++ b/Godeps/_workspace/src/github.com/yudai/utf8reader/utf8reader.go @@ -0,0 +1,56 @@ +package utf8reader + +import ( + "bytes" + "errors" + "io" + "unicode/utf8" +) + +var SmallBufferError = errors.New("Buffer size must be larger than utf8.UTFMax.") + +type UTF8Reader struct { + reader io.Reader + buffer *bytes.Buffer +} + +func New(reader io.Reader) *UTF8Reader { + return &UTF8Reader{ + reader: reader, + buffer: bytes.NewBuffer(make([]byte, 0)), + } +} + +func (r *UTF8Reader) Read(p []byte) (n int, err error) { + size := 0 + + if cap(p) < utf8.UTFMax { + return size, SmallBufferError + } + + if r.buffer.Len() > 0 { + n, err = r.buffer.Read(p) + size += n + if err != nil { + return size, err + } + } + + n, err = r.reader.Read(p[size:]) + size += n + if err != nil { + return size, err + } + + leftOver := 0 + for ; leftOver < utf8.UTFMax; leftOver++ { + rune, _ := utf8.DecodeLastRune(p[:size-leftOver]) + if rune != utf8.RuneError { + break + } + } + + r.buffer.Write(p[size-leftOver : size]) + + return size - leftOver, nil +} diff --git a/Godeps/_workspace/src/github.com/yudai/utf8reader/utf8reader_test.go b/Godeps/_workspace/src/github.com/yudai/utf8reader/utf8reader_test.go new file mode 100644 index 0000000..2e0e01b --- /dev/null +++ b/Godeps/_workspace/src/github.com/yudai/utf8reader/utf8reader_test.go @@ -0,0 +1,79 @@ +package utf8reader + +import ( + "testing" + + "bytes" + "strings" +) + +func TestRead(t *testing.T) { + str := "日本語" + or := strings.NewReader(str) + r := New(or) + + buf := make([]byte, 512) + n, err := r.Read(buf) + if err != nil { + t.Errorf("Unexpected error") + } + if bytes.Compare(buf[:n], []byte(str)) != 0 { + t.Errorf("Failed to read bytes") + } + + n, err = r.Read(buf) + if err.Error() != "EOF" { + t.Errorf("Unexpected error") + } + + // 3 
byte runes + str = "いろはにほ" + or = strings.NewReader(str) + r = New(or) + buf = make([]byte, 7) // 7 % 3 = 1 + + n, err = r.Read(buf) + if err != nil { + t.Errorf("Unexpected error") + } + if n != 6 { + t.Errorf("Read length error") + } + if bytes.Compare(buf[:n], []byte(str)[:6]) != 0 { + t.Errorf("Failed to read bytes") + } + + n, err = r.Read(buf) + if err != nil { + t.Errorf("Unexpected error") + } + if n != 6 { + t.Errorf("Read length error") + } + if bytes.Compare(buf[:n], []byte(str)[6:12]) != 0 { + t.Errorf("Failed to read bytes") + } + + n, err = r.Read(buf) + if err != nil { + t.Errorf("Unexpected error") + } + if n != 3 { + t.Errorf("Read length error") + } + if bytes.Compare(buf[:n], []byte(str)[12:15]) != 0 { + t.Errorf("Failed to read bytes") + } +} + +func TestReadWithSmallBuffer(t *testing.T) { + str := "日本語" + or := strings.NewReader(str) + r := New(or) + + buf := make([]byte, 2) // too small + _, err := r.Read(buf) + if err != SmallBufferError { + t.Errorf("Expected error were not returned") + } +} diff --git a/app/app.go b/app/app.go index 99c3a2a..92d1624 100644 --- a/app/app.go +++ b/app/app.go @@ -7,12 +7,12 @@ import ( "os/exec" "strings" "syscall" - "unicode/utf8" "unsafe" "github.com/elazarl/go-bindata-assetfs" "github.com/gorilla/websocket" "github.com/kr/pty" + "github.com/yudai/utf8reader" ) type App struct { @@ -83,11 +83,10 @@ func (app *App) generateHandler() func(w http.ResponseWriter, r *http.Request) { defer func() { exit <- true }() buf := make([]byte, 1024) - leftOver := 0 - for { - size, err := fio.Read(buf[leftOver:]) - size += leftOver + utf8f := utf8reader.New(fio) + for { + size, err := utf8f.Read(buf) if err != nil { log.Printf("command exited for: %s", r.RemoteAddr) return @@ -98,31 +97,8 @@ func (app *App) generateHandler() func(w http.ResponseWriter, r *http.Request) { return } - // UTF-8 Boundary check - for leftOver = 0; leftOver < utf8.UTFMax; leftOver++ { - re, _ := utf8.DecodeLastRune( - buf[:size-leftOver], - ) - - 
if re != utf8.RuneError { - break - } - // Invalid UTF rune - } - - if leftOver == utf8.UTFMax-1 { - re, _ := utf8.DecodeLastRune(buf[:size-leftOver]) - if re == utf8.RuneError { - log.Fatal("UTF8 Boundary error.") - } - } - - writer.Write(buf[:size-leftOver]) + writer.Write(buf[:size]) writer.Close() - - for i := 0; i < leftOver; i++ { - buf[i] = buf[size-leftOver+i] - } } }()