parsing.go 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450
  1. package govaluate
  2. import (
  3. "bytes"
  4. "errors"
  5. "fmt"
  6. "regexp"
  7. "strconv"
  8. "time"
  9. "unicode"
  10. )
  11. func parseTokens(expression string, functions map[string]ExpressionFunction) ([]ExpressionToken, error) {
  12. var ret []ExpressionToken
  13. var token ExpressionToken
  14. var stream *lexerStream
  15. var state lexerState
  16. var err error
  17. var found bool
  18. stream = newLexerStream(expression)
  19. state = validLexerStates[0]
  20. for stream.canRead() {
  21. token, err, found = readToken(stream, state, functions)
  22. if err != nil {
  23. return ret, err
  24. }
  25. if !found {
  26. break
  27. }
  28. state, err = getLexerStateForToken(token.Kind)
  29. if err != nil {
  30. return ret, err
  31. }
  32. // append this valid token
  33. ret = append(ret, token)
  34. }
  35. err = checkBalance(ret)
  36. if err != nil {
  37. return nil, err
  38. }
  39. return ret, nil
  40. }
  41. func readToken(stream *lexerStream, state lexerState, functions map[string]ExpressionFunction) (ExpressionToken, error, bool) {
  42. var function ExpressionFunction
  43. var ret ExpressionToken
  44. var tokenValue interface{}
  45. var tokenTime time.Time
  46. var tokenString string
  47. var kind TokenKind
  48. var character rune
  49. var found bool
  50. var completed bool
  51. var err error
  52. // numeric is 0-9, or .
  53. // string starts with '
  54. // variable is alphanumeric, always starts with a letter
  55. // bracket always means variable
  56. // symbols are anything non-alphanumeric
  57. // all others read into a buffer until they reach the end of the stream
  58. for stream.canRead() {
  59. character = stream.readCharacter()
  60. if unicode.IsSpace(character) {
  61. continue
  62. }
  63. kind = UNKNOWN
  64. // numeric constant
  65. if isNumeric(character) {
  66. tokenString = readTokenUntilFalse(stream, isNumeric)
  67. tokenValue, err = strconv.ParseFloat(tokenString, 64)
  68. if err != nil {
  69. errorMsg := fmt.Sprintf("Unable to parse numeric value '%v' to float64\n", tokenString)
  70. return ExpressionToken{}, errors.New(errorMsg), false
  71. }
  72. kind = NUMERIC
  73. break
  74. }
  75. // comma, separator
  76. if character == ',' {
  77. tokenValue = ","
  78. kind = SEPARATOR
  79. break
  80. }
  81. // escaped variable
  82. if character == '[' {
  83. tokenValue, completed = readUntilFalse(stream, true, false, true, isNotClosingBracket)
  84. kind = VARIABLE
  85. if !completed {
  86. return ExpressionToken{}, errors.New("Unclosed parameter bracket"), false
  87. }
  88. // above method normally rewinds us to the closing bracket, which we want to skip.
  89. stream.rewind(-1)
  90. break
  91. }
  92. // regular variable - or function?
  93. if unicode.IsLetter(character) {
  94. tokenString = readTokenUntilFalse(stream, isVariableName)
  95. tokenValue = tokenString
  96. kind = VARIABLE
  97. // boolean?
  98. if tokenValue == "true" {
  99. kind = BOOLEAN
  100. tokenValue = true
  101. } else {
  102. if tokenValue == "false" {
  103. kind = BOOLEAN
  104. tokenValue = false
  105. }
  106. }
  107. // textual operator?
  108. if tokenValue == "in" || tokenValue == "IN" {
  109. // force lower case for consistency
  110. tokenValue = "in"
  111. kind = COMPARATOR
  112. }
  113. // function?
  114. function, found = functions[tokenString]
  115. if found {
  116. kind = FUNCTION
  117. tokenValue = function
  118. }
  119. break
  120. }
  121. if !isNotQuote(character) {
  122. tokenValue, completed = readUntilFalse(stream, true, false, true, isNotQuote)
  123. if !completed {
  124. return ExpressionToken{}, errors.New("Unclosed string literal"), false
  125. }
  126. // advance the stream one position, since reading until false assumes the terminator is a real token
  127. stream.rewind(-1)
  128. // check to see if this can be parsed as a time.
  129. tokenTime, found = tryParseTime(tokenValue.(string))
  130. if found {
  131. kind = TIME
  132. tokenValue = tokenTime
  133. } else {
  134. kind = STRING
  135. }
  136. break
  137. }
  138. if character == '(' {
  139. tokenValue = character
  140. kind = CLAUSE
  141. break
  142. }
  143. if character == ')' {
  144. tokenValue = character
  145. kind = CLAUSE_CLOSE
  146. break
  147. }
  148. // must be a known symbol
  149. tokenString = readTokenUntilFalse(stream, isNotAlphanumeric)
  150. tokenValue = tokenString
  151. // quick hack for the case where "-" can mean "prefixed negation" or "minus", which are used
  152. // very differently.
  153. if state.canTransitionTo(PREFIX) {
  154. _, found = prefixSymbols[tokenString]
  155. if found {
  156. kind = PREFIX
  157. break
  158. }
  159. }
  160. _, found = modifierSymbols[tokenString]
  161. if found {
  162. kind = MODIFIER
  163. break
  164. }
  165. _, found = logicalSymbols[tokenString]
  166. if found {
  167. kind = LOGICALOP
  168. break
  169. }
  170. _, found = comparatorSymbols[tokenString]
  171. if found {
  172. kind = COMPARATOR
  173. break
  174. }
  175. _, found = ternarySymbols[tokenString]
  176. if found {
  177. kind = TERNARY
  178. break
  179. }
  180. errorMessage := fmt.Sprintf("Invalid token: '%s'", tokenString)
  181. return ret, errors.New(errorMessage), false
  182. }
  183. ret.Kind = kind
  184. ret.Value = tokenValue
  185. return ret, nil, (kind != UNKNOWN)
  186. }
  187. func readTokenUntilFalse(stream *lexerStream, condition func(rune) bool) string {
  188. var ret string
  189. stream.rewind(1)
  190. ret, _ = readUntilFalse(stream, false, true, true, condition)
  191. return ret
  192. }
  193. /*
  194. Returns the string that was read until the given [condition] was false, or whitespace was broken.
  195. Returns false if the stream ended before whitespace was broken or condition was met.
  196. */
  197. func readUntilFalse(stream *lexerStream, includeWhitespace bool, breakWhitespace bool, allowEscaping bool, condition func(rune) bool) (string, bool) {
  198. var tokenBuffer bytes.Buffer
  199. var character rune
  200. var conditioned bool
  201. conditioned = false
  202. for stream.canRead() {
  203. character = stream.readCharacter()
  204. // Use backslashes to escape anything
  205. if allowEscaping && character == '\\' {
  206. character = stream.readCharacter()
  207. tokenBuffer.WriteString(string(character))
  208. continue
  209. }
  210. if unicode.IsSpace(character) {
  211. if breakWhitespace && tokenBuffer.Len() > 0 {
  212. conditioned = true
  213. break
  214. }
  215. if !includeWhitespace {
  216. continue
  217. }
  218. }
  219. if condition(character) {
  220. tokenBuffer.WriteString(string(character))
  221. } else {
  222. conditioned = true
  223. stream.rewind(1)
  224. break
  225. }
  226. }
  227. return tokenBuffer.String(), conditioned
  228. }
  229. /*
  230. Checks to see if any optimizations can be performed on the given [tokens], which form a complete, valid expression.
  231. The returns slice will represent the optimized (or unmodified) list of tokens to use.
  232. */
  233. func optimizeTokens(tokens []ExpressionToken) ([]ExpressionToken, error) {
  234. var token ExpressionToken
  235. var symbol OperatorSymbol
  236. var err error
  237. var index int
  238. for index, token = range tokens {
  239. // if we find a regex operator, and the right-hand value is a constant, precompile and replace with a pattern.
  240. if token.Kind != COMPARATOR {
  241. continue
  242. }
  243. symbol = comparatorSymbols[token.Value.(string)]
  244. if symbol != REQ && symbol != NREQ {
  245. continue
  246. }
  247. index++
  248. token = tokens[index]
  249. if token.Kind == STRING {
  250. token.Kind = PATTERN
  251. token.Value, err = regexp.Compile(token.Value.(string))
  252. if err != nil {
  253. return tokens, err
  254. }
  255. tokens[index] = token
  256. }
  257. }
  258. return tokens, nil
  259. }
  260. /*
  261. Checks the balance of tokens which have multiple parts, such as parenthesis.
  262. */
  263. func checkBalance(tokens []ExpressionToken) error {
  264. var stream *tokenStream
  265. var token ExpressionToken
  266. var parens int
  267. stream = newTokenStream(tokens)
  268. for stream.hasNext() {
  269. token = stream.next()
  270. if token.Kind == CLAUSE {
  271. parens++
  272. continue
  273. }
  274. if token.Kind == CLAUSE_CLOSE {
  275. parens--
  276. continue
  277. }
  278. }
  279. if parens != 0 {
  280. return errors.New("Unbalanced parenthesis")
  281. }
  282. return nil
  283. }
  284. func isNumeric(character rune) bool {
  285. return unicode.IsDigit(character) || character == '.'
  286. }
  287. func isNotQuote(character rune) bool {
  288. return character != '\'' && character != '"'
  289. }
  290. func isNotAlphanumeric(character rune) bool {
  291. return !(unicode.IsDigit(character) ||
  292. unicode.IsLetter(character) ||
  293. character == '(' ||
  294. character == ')' ||
  295. !isNotQuote(character))
  296. }
  297. func isVariableName(character rune) bool {
  298. return unicode.IsLetter(character) ||
  299. unicode.IsDigit(character) ||
  300. character == '_'
  301. }
  302. func isNotClosingBracket(character rune) bool {
  303. return character != ']'
  304. }
  305. /*
  306. Attempts to parse the [candidate] as a Time.
  307. Tries a series of standardized date formats, returns the Time if one applies,
  308. otherwise returns false through the second return.
  309. */
  310. func tryParseTime(candidate string) (time.Time, bool) {
  311. var ret time.Time
  312. var found bool
  313. timeFormats := [...]string{
  314. time.ANSIC,
  315. time.UnixDate,
  316. time.RubyDate,
  317. time.Kitchen,
  318. time.RFC3339,
  319. time.RFC3339Nano,
  320. "2006-01-02", // RFC 3339
  321. "2006-01-02 15:04", // RFC 3339 with minutes
  322. "2006-01-02 15:04:05", // RFC 3339 with seconds
  323. "2006-01-02 15:04:05-07:00", // RFC 3339 with seconds and timezone
  324. "2006-01-02T15Z0700", // ISO8601 with hour
  325. "2006-01-02T15:04Z0700", // ISO8601 with minutes
  326. "2006-01-02T15:04:05Z0700", // ISO8601 with seconds
  327. "2006-01-02T15:04:05.999999999Z0700", // ISO8601 with nanoseconds
  328. }
  329. for _, format := range timeFormats {
  330. ret, found = tryParseExactTime(candidate, format)
  331. if found {
  332. return ret, true
  333. }
  334. }
  335. return time.Now(), false
  336. }
  337. func tryParseExactTime(candidate string, format string) (time.Time, bool) {
  338. var ret time.Time
  339. var err error
  340. ret, err = time.ParseInLocation(format, candidate, time.Local)
  341. if err != nil {
  342. return time.Now(), false
  343. }
  344. return ret, true
  345. }