escape.go 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285
  1. package yu_url
  2. import (
  3. yu_const "gogs.qqck.cn/s/tools/const"
  4. )
  // encoding names the part of a URL that a string is being escaped or
  // unescaped for. The set of bytes that must be escaped differs per part.
  type encoding int

  // Escaping modes understood by escape, unescape and shouldEscape.
  // Numbering starts at 1, so the zero value of encoding matches none of them.
  const (
  	encodePath           encoding = iota + 1 // a path treated as a whole
  	encodePathSegment                        // a single path segment
  	encodeHost                               // the host, including any :port or [ipv6] form
  	encodeZone                               // an IPv6 zone identifier
  	encodeUserPassword                       // the userinfo (user or password)
  	encodeQueryComponent                     // a query key or value
  	encodeFragment                           // the fragment
  )
  15. // QueryEscape escapes the string so it can be safely placed
  16. // inside a URL query.
  17. func QueryEscape(s string) string {
  18. return escape(s, encodeQueryComponent)
  19. }
  20. // PathEscape escapes the string so it can be safely placed inside a URL path segment,
  21. // replacing special characters (including /) with %XX sequences as needed.
  22. func PathEscape(s string) string {
  23. return escape(s, encodePathSegment)
  24. }
  25. func escape(s string, mode encoding) string {
  26. spaceCount, hexCount := 0, 0
  27. for i := 0; i < len(s); i++ {
  28. c := s[i]
  29. if shouldEscape(c, mode) {
  30. if c == ' ' && mode == encodeQueryComponent {
  31. spaceCount++
  32. } else {
  33. hexCount++
  34. }
  35. }
  36. }
  37. if spaceCount == 0 && hexCount == 0 {
  38. return s
  39. }
  40. var buf [64]byte
  41. var t []byte
  42. required := len(s) + 2*hexCount
  43. if required <= len(buf) {
  44. t = buf[:required]
  45. } else {
  46. t = make([]byte, required)
  47. }
  48. if hexCount == 0 {
  49. copy(t, s)
  50. for i := 0; i < len(s); i++ {
  51. if s[i] == ' ' {
  52. t[i] = '+'
  53. }
  54. }
  55. return string(t)
  56. }
  57. j := 0
  58. for i := 0; i < len(s); i++ {
  59. switch c := s[i]; {
  60. case c == ' ' && mode == encodeQueryComponent:
  61. t[j] = '+'
  62. j++
  63. case shouldEscape(c, mode):
  64. t[j] = '%'
  65. t[j+1] = yu_const.HexUpper[c>>4]
  66. t[j+2] = yu_const.HexUpper[c&15]
  67. j += 3
  68. default:
  69. t[j] = s[i]
  70. j++
  71. }
  72. }
  73. return string(t)
  74. }
  75. // Return true if the specified character should be escaped when
  76. // appearing in a URL string, according to RFC 3986.
  77. //
  78. // Please be informed that for now shouldEscape does not check all
  79. // reserved characters correctly. See golang.org/issue/5684.
  80. func shouldEscape(c byte, mode encoding) bool {
  81. // §2.3 Unreserved characters (alphanum)
  82. if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
  83. return false
  84. }
  85. if mode == encodeHost || mode == encodeZone {
  86. // §3.2.2 Host allows
  87. // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
  88. // as part of reg-name.
  89. // We add : because we include :port as part of host.
  90. // We add [ ] because we include [ipv6]:port as part of host.
  91. // We add < > because they're the only characters left that
  92. // we could possibly allow, and Parse will reject them if we
  93. // escape them (because hosts can't use %-encoding for
  94. // ASCII bytes).
  95. switch c {
  96. case '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', ':', '[', ']', '<', '>', '"':
  97. return false
  98. }
  99. }
  100. switch c {
  101. case '-', '_', '.', '~': // §2.3 Unreserved characters (mark)
  102. return false
  103. case '$', '&', '+', ',', '/', ':', ';', '=', '?', '@': // §2.2 Reserved characters (reserved)
  104. // Different sections of the URL allow a few of
  105. // the reserved characters to appear unescaped.
  106. switch mode {
  107. case encodePath: // §3.3
  108. // The RFC allows : @ & = + $ but saves / ; , for assigning
  109. // meaning to individual path segments. This package
  110. // only manipulates the path as a whole, so we allow those
  111. // last three as well. That leaves only ? to escape.
  112. return c == '?'
  113. case encodePathSegment: // §3.3
  114. // The RFC allows : @ & = + $ but saves / ; , for assigning
  115. // meaning to individual path segments.
  116. return c == '/' || c == ';' || c == ',' || c == '?'
  117. case encodeUserPassword: // §3.2.1
  118. // The RFC allows ';', ':', '&', '=', '+', '$', and ',' in
  119. // userinfo, so we must escape only '@', '/', and '?'.
  120. // The parsing of userinfo treats ':' as special so we must escape
  121. // that too.
  122. return c == '@' || c == '/' || c == '?' || c == ':'
  123. case encodeQueryComponent: // §3.4
  124. // The RFC reserves (so we must escape) everything.
  125. return true
  126. case encodeFragment: // §4.1
  127. // The RFC text is silent but the grammar allows
  128. // everything, so escape nothing.
  129. return false
  130. }
  131. }
  132. if mode == encodeFragment {
  133. // RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are
  134. // included in reserved from RFC 2396 §2.2. The remaining sub-delims do not
  135. // need to be escaped. To minimize potential breakage, we apply two restrictions:
  136. // (1) we always escape sub-delims outside of the fragment, and (2) we always
  137. // escape single quote to avoid breaking callers that had previously assumed that
  138. // single quotes would be escaped. See issue #19917.
  139. switch c {
  140. case '!', '(', ')', '*':
  141. return false
  142. }
  143. }
  144. // Everything else must be escaped.
  145. return true
  146. }
  147. // QueryUnescape does the inverse transformation of QueryEscape,
  148. // converting each 3-byte encoded substring of the form "%AB" into the
  149. // hex-decoded byte 0xAB.
  150. // It returns an error if any % is not followed by two hexadecimal
  151. // digits.
  152. func QueryUnescape(s string) string {
  153. return unescape(s, encodeQueryComponent)
  154. }
  155. // PathUnescape does the inverse transformation of PathEscape,
  156. // converting each 3-byte encoded substring of the form "%AB" into the
  157. // hex-decoded byte 0xAB. It returns an error if any % is not followed
  158. // by two hexadecimal digits.
  159. //
  160. // PathUnescape is identical to QueryUnescape except that it does not
  161. // unescape '+' to ' ' (space).
  162. func PathUnescape(s string) string {
  163. return unescape(s, encodePathSegment)
  164. }
  165. // unescape unescapes a string; the mode specifies
  166. // which section of the URL string is being unescaped.
  167. func unescape(s string, mode encoding) string {
  168. // Count %, check that they're well-formed.
  169. n := 0
  170. hasPlus := false
  171. for i := 0; i < len(s); {
  172. switch s[i] {
  173. case '%':
  174. n++
  175. if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) {
  176. s = s[i:]
  177. if len(s) > 3 {
  178. s = s[:3]
  179. }
  180. return ""
  181. }
  182. // Per https://tools.ietf.org/html/rfc3986#page-21
  183. // in the host component %-encoding can only be used
  184. // for non-ASCII bytes.
  185. // But https://tools.ietf.org/html/rfc6874#section-2
  186. // introduces %25 being allowed to escape a percent sign
  187. // in IPv6 scoped-address literals. Yay.
  188. if mode == encodeHost && unhex(s[i+1]) < 8 && s[i:i+3] != "%25" {
  189. return ""
  190. }
  191. if mode == encodeZone {
  192. // RFC 6874 says basically "anything goes" for zone identifiers
  193. // and that even non-ASCII can be redundantly escaped,
  194. // but it seems prudent to restrict %-escaped bytes here to those
  195. // that are valid host name bytes in their unescaped form.
  196. // That is, you can use escaping in the zone identifier but not
  197. // to introduce bytes you couldn't just write directly.
  198. // But Windows puts spaces here! Yay.
  199. v := unhex(s[i+1])<<4 | unhex(s[i+2])
  200. if s[i:i+3] != "%25" && v != ' ' && shouldEscape(v, encodeHost) {
  201. return ""
  202. }
  203. }
  204. i += 3
  205. case '+':
  206. hasPlus = mode == encodeQueryComponent
  207. i++
  208. default:
  209. if (mode == encodeHost || mode == encodeZone) && s[i] < 0x80 && shouldEscape(s[i], mode) {
  210. return ""
  211. }
  212. i++
  213. }
  214. }
  215. if n == 0 && !hasPlus {
  216. return s
  217. }
  218. t := make([]byte, 0, len(s)-2*n)
  219. for i := 0; i < len(s); i++ {
  220. switch s[i] {
  221. case '%':
  222. t = append(t, unhex(s[i+1])<<4|unhex(s[i+2]))
  223. i += 2
  224. case '+':
  225. if mode == encodeQueryComponent {
  226. t = append(t, ' ')
  227. } else {
  228. t = append(t, '+')
  229. }
  230. default:
  231. t = append(t, s[i])
  232. }
  233. }
  234. return string(t)
  235. }
  // ishex reports whether c is an ASCII hexadecimal digit
  // (0-9, a-f or A-F).
  func ishex(c byte) bool {
  	return ('0' <= c && c <= '9') ||
  		('a' <= c && c <= 'f') ||
  		('A' <= c && c <= 'F')
  }
  // unhex returns the numeric value (0-15) of the ASCII hexadecimal
  // digit c. Any byte that is not a hexadecimal digit yields 0.
  func unhex(c byte) byte {
  	if '0' <= c && c <= '9' {
  		return c - '0'
  	}
  	if 'a' <= c && c <= 'f' {
  		return c - 'a' + 10
  	}
  	if 'A' <= c && c <= 'F' {
  		return c - 'A' + 10
  	}
  	return 0
  }
  257. }