Skip to content

Commit fa1fa1f

Browse files
committed
Extract URI parsing logic into lib/http/uri/parsing.rb
1 parent 372a5bd commit fa1fa1f

4 files changed

Lines changed: 232 additions & 188 deletions

File tree

lib/http/uri.rb

Lines changed: 4 additions & 172 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ module HTTP
77
#
88
# Stores URI components as instance variables. ASCII URIs use stdlib's URI.parse;
99
# Addressable is used only for non-ASCII (IRI) strings or when stdlib rejects the input.
10-
class URI # rubocop:disable Metrics/ClassLength
10+
class URI
1111
# The URI given was not valid
1212
class InvalidError < HTTP::RequestError; end
1313

@@ -108,55 +108,6 @@ class InvalidError < HTTP::RequestError; end
108108
# @private
109109
NEEDS_ADDRESSABLE = /[^\x20-\x7E]/
110110

111-
# Parse the given URI string, returning an HTTP::URI object
112-
#
113-
# @example
114-
# HTTP::URI.parse("http://example.com/path")
115-
#
116-
# @param [HTTP::URI, String, #to_str] uri to parse
117-
#
118-
# @api public
119-
# @return [HTTP::URI] new URI instance
120-
def self.parse(uri)
121-
return uri if uri.is_a?(self)
122-
raise InvalidError, "invalid URI: nil" if uri.nil?
123-
124-
uri_string = begin
125-
String(uri)
126-
rescue TypeError, NoMethodError
127-
raise InvalidError, "invalid URI: #{uri.inspect}"
128-
end
129-
new(**parse_components(uri_string))
130-
end
131-
132-
# Encodes key/value pairs as application/x-www-form-urlencoded
133-
#
134-
# @example
135-
# HTTP::URI.form_encode(foo: "bar")
136-
#
137-
# @param [#to_hash, #to_ary] form_values to encode
138-
# @param [TrueClass, FalseClass] sort should key/value pairs be sorted first?
139-
#
140-
# @api public
141-
# @return [String] encoded value
142-
def self.form_encode(form_values, sort: false)
143-
return ::URI.encode_www_form(form_values) unless sort
144-
145-
::URI.encode_www_form(form_values.sort_by { |k, _| String(k) })
146-
end
147-
148-
# Percent-encode matching characters in a string
149-
#
150-
# @param [String, nil] string raw string
150-
#
151-
# @api private
152-
# @return [String, nil] encoded value, or nil when string is nil
154-
def self.percent_encode(string)
155-
string&.gsub(PERCENT_ENCODE) do |substr|
156-
substr.bytes.map { |c| format("%%%02X", c) }.join
157-
end
158-
end
159-
160111
# Creates an HTTP::URI instance from the given keyword arguments
161112
#
162113
# @example
@@ -383,19 +334,10 @@ def dup
383334
#
384335
# @api public
385336
# @return [String] URI serialized as a String
386-
def to_s # rubocop:disable Metrics/CyclomaticComplexity,Metrics/MethodLength
337+
def to_s
387338
str = +""
388339
str << "#{@scheme}:" if @scheme
389-
if (raw_host = @raw_host)
390-
str << "//"
391-
if (user = @user)
392-
str << user
393-
str << ":#{@password}" if @password
394-
str << "@"
395-
end
396-
str << raw_host
397-
str << ":#{@port}" if @port
398-
end
340+
str << authority_string if @raw_host
399341
str << @path
400342
str << "?#{@query}" if @query
401343
str << "##{@fragment}" if @fragment
@@ -427,118 +369,8 @@ def deconstruct_keys(keys)
427369
query: @query, fragment: @fragment, user: @user, password: @password }
428370
keys ? hash.slice(*keys) : hash
429371
end
430-
431-
# Loads the addressable gem on first use
432-
#
433-
# @api private
434-
# @return [void]
435-
# @raise [LoadError] if addressable gem is not installed
436-
def self.require_addressable
437-
return if defined?(@addressable_loaded)
438-
439-
require "addressable/uri"
440-
@addressable_loaded = true
441-
end
442-
443-
# Convert a hostname to ASCII via IDNA (requires addressable)
444-
#
445-
# @param [String] host hostname to encode
446-
# @api private
447-
# @return [String] ASCII-encoded hostname
448-
def self.idna_to_ascii(host)
449-
return host if host.ascii_only?
450-
451-
require_addressable
452-
Addressable::IDNA.to_ascii(host) # steep:ignore
453-
end
454-
455-
private
456-
457-
# Adds or removes IPv6 brackets from a host
458-
#
459-
# @param [String] raw_host
460-
# @param [Boolean] brackets
461-
# @api private
462-
# @return [String] Host with IPv6 address brackets added or removed
463-
def process_ipv6_brackets(raw_host, brackets: false)
464-
return unless raw_host
465-
466-
stripped = raw_host.delete_prefix("[").delete_suffix("]")
467-
ip = IPAddr.new(stripped)
468-
469-
if ip.ipv6?
470-
brackets ? "[#{ip}]" : ip.to_s
471-
else
472-
raw_host
473-
end
474-
rescue IPAddr::Error
475-
raw_host
476-
end
477-
478-
# Normalize a host for comparison and lookup
479-
#
480-
# Percent-decodes, strips trailing dot, lowercases, and IDN-encodes
481-
# non-ASCII hostnames.
482-
#
483-
# @param [String, nil] host the host to normalize
484-
# @api private
485-
# @return [String, nil] normalized host
486-
def normalize_host(host)
487-
return nil unless host
488-
489-
h = host.gsub(/%\h{2}/) { |match| match.delete_prefix("%").to_i(16).chr }
490-
h = h.delete_suffix(".")
491-
h = h.downcase
492-
self.class.idna_to_ascii(h)
493-
end
494-
495-
# Parse a URI string into component parts
496-
#
497-
# Uses stdlib for printable-ASCII URIs (faster), falling back to
498-
# Addressable for non-ASCII or when stdlib rejects the input.
499-
#
500-
# @param [String] uri_string the URI to parse
501-
# @api private
502-
# @return [Hash] URI components
503-
private_class_method def self.parse_components(uri_string)
504-
return parse_with_addressable(uri_string) if uri_string.match?(NEEDS_ADDRESSABLE)
505-
506-
parse_with_stdlib(uri_string) || parse_with_addressable(uri_string)
507-
end
508-
509-
# Parse an ASCII URI using stdlib
510-
#
511-
# @param [String] uri_string the URI to parse
512-
# @api private
513-
# @return [Hash, nil] URI components, or nil if stdlib rejects the input
514-
private_class_method def self.parse_with_stdlib(uri_string)
515-
parsed = ::URI.parse(uri_string)
516-
# stdlib always returns a port (defaulting to scheme's default);
517-
# only store it when explicitly specified
518-
port = parsed.port
519-
port = nil if port == parsed.default_port
520-
{ scheme: parsed.scheme, user: parsed.user, password: parsed.password,
521-
host: parsed.host, port: port, path: parsed.path,
522-
query: parsed.query, fragment: parsed.fragment }
523-
rescue ::URI::InvalidURIError
524-
nil
525-
end
526-
527-
# Parse a URI using Addressable (non-ASCII input, or input that stdlib rejected)
528-
#
529-
# @param [String] uri_string the URI to parse
530-
# @api private
531-
# @return [Hash] URI components
532-
private_class_method def self.parse_with_addressable(uri_string)
533-
require_addressable
534-
parsed = Addressable::URI.parse(uri_string) # steep:ignore
535-
{ scheme: parsed.scheme, user: parsed.user, password: parsed.password,
536-
host: parsed.host, port: parsed.port, path: parsed.path,
537-
query: parsed.query, fragment: parsed.fragment }
538-
rescue Addressable::URI::InvalidURIError # steep:ignore
539-
raise InvalidError, "invalid URI: #{uri_string.inspect}"
540-
end
541372
end
542373
end
543374

375+
require "http/uri/parsing"
544376
require "http/uri/normalizer"

lib/http/uri/normalizer.rb

Lines changed: 40 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
# frozen_string_literal: true
22

33
module HTTP
4-
class URI # rubocop:disable Style/Documentation
4+
# URI normalization and dot-segment removal
5+
class URI
56
# Default URI normalizer
67
# @private
78
NORMALIZER = lambda do |uri|
@@ -27,31 +28,54 @@ class URI # rubocop:disable Style/Documentation
2728
# @private
2829
DOT_SEGMENTS = %w[. ..].freeze
2930

31+
# Matches a leading "/." followed by "/" or end-of-string
32+
# @private
33+
SINGLE_DOT_SEGMENT = %r{\A/\.(?:/|\z)}
34+
35+
# Matches a leading "/.." followed by "/" or end-of-string
36+
# @private
37+
DOUBLE_DOT_SEGMENT = %r{\A/\.\.(?:/|\z)}
38+
39+
# Matches the last segment in a path (the final "/" and everything after it)
40+
# @private
41+
LAST_SEGMENT = %r{/[^/]*\z}
42+
43+
# Matches the first path segment, with or without a leading "/"
44+
# @private
45+
FIRST_SEGMENT = %r{\A/?[^/]*}
46+
3047
# Remove dot segments from a URI path per RFC 3986 Section 5.2.4
3148
#
3249
# @param [String] path URI path to normalize
3350
#
3451
# @api private
3552
# @return [String] path with dot segments removed
36-
def self.remove_dot_segments(path) # rubocop:disable Metrics/MethodLength
53+
def self.remove_dot_segments(path)
3754
input = path.dup
3855
output = +""
39-
40-
until input.empty?
41-
unless input.delete_prefix!("../") || input.delete_prefix!("./") ||
42-
input.sub!(%r{\A/\.(?:/|\z)}, "/")
43-
if input.sub!(%r{\A/\.\.(?:/|\z)}, "/")
44-
output.sub!(%r{/[^/]*\z}, "")
45-
elsif DOT_SEGMENTS.include?(input)
46-
break
47-
else
48-
output << input.slice!(%r{\A/?[^/]*}) # steep:ignore
49-
end
50-
end
51-
end
52-
56+
reduce_dot_segment(input, output) until input.empty?
5357
output
5458
end
5559
private_class_method :remove_dot_segments
60+
61+
# Process a single dot-segment removal step per RFC 3986 Section 5.2.4
62+
#
63+
# @param [String] input remaining path input (mutated)
64+
# @param [String] output accumulated result (mutated)
65+
#
66+
# @api private
67+
# @return [void]
68+
private_class_method def self.reduce_dot_segment(input, output)
69+
return if input.delete_prefix!("../") || input.delete_prefix!("./") ||
70+
input.sub!(SINGLE_DOT_SEGMENT, "/")
71+
72+
if input.sub!(DOUBLE_DOT_SEGMENT, "/")
73+
output.sub!(LAST_SEGMENT, "")
74+
elsif DOT_SEGMENTS.include?(input)
75+
input.clear
76+
else
77+
output << input.slice!(FIRST_SEGMENT) # steep:ignore
78+
end
79+
end
5680
end
5781
end

0 commit comments

Comments
 (0)