Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -265,8 +265,8 @@ OPTIMIZATIONS:
-retries int number of retries
-timeout int timeout in seconds (default 10)
-delay value duration between each http request (eg: 200ms, 1s) (default -1ns)
-rsts, -response-size-to-save int max response size to save in bytes (default 512000000)
-rstr, -response-size-to-read int max response size to read in bytes (default 512000000)
-rsts, -response-size-to-save int max response size to save in bytes (default 50000000)
-rstr, -response-size-to-read int max response size to read in bytes (default 50000000)

CLOUD:
-auth configure projectdiscovery cloud (pdcp) api key (default true)
Expand Down
41 changes: 26 additions & 15 deletions common/httpx/httpx.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package httpx

import (
"bytes"
"context"
"crypto/tls"
"fmt"
Expand Down Expand Up @@ -291,21 +292,19 @@ get_response:
return nil, closeErr
}

// Todo: replace with https://github.com/projectdiscovery/utils/issues/110
resp.RawData = make([]byte, len(respbody))
copy(resp.RawData, respbody)
// Keep a reference to the undecoded body. DecodeData returns the same slice
// when no transcoding is needed (the common case), so RawData and Data end up
// sharing the same backing array and we avoid an extra full-body copy. When
// DecodeData transcodes it returns a fresh slice, so RawData still holds the
// original undecoded bytes. Both fields are read-only afterwards, so sharing
// the backing array is safe.
rawbody := respbody

respbody, err = DecodeData(respbody, httpresp.Header)
if err != nil && !shouldIgnoreBodyErrors {
return nil, err
}

respbodystr := string(respbody)

// check if we need to strip html
if h.Options.VHostStripHTML {
respbodystr = h.htmlPolicy.Sanitize(respbodystr)
}
resp.RawData = rawbody

// if content length is not defined
if resp.ContentLength <= 0 {
Expand All @@ -326,11 +325,23 @@ get_response:

// fill metrics
resp.StatusCode = httpresp.StatusCode
if respbodystr != "" {
// number of words
resp.Words = len(strings.Split(respbodystr, " "))
// number of lines
resp.Lines = len(strings.Split(strings.TrimSpace(respbodystr), "\n"))

// Word/line counts are computed directly over the body bytes to avoid
// materializing an extra full-body string copy (and the slice produced by
// strings.Split) on the hot path. When HTML stripping is enabled the
// sanitized string is required, so counts are derived from it to preserve the
// previous behavior.
if h.Options.VHostStripHTML {
respbodystr := h.htmlPolicy.Sanitize(string(respbody))
if respbodystr != "" {
resp.Words = len(strings.Split(respbodystr, " "))
resp.Lines = len(strings.Split(strings.TrimSpace(respbodystr), "\n"))
}
} else if len(respbody) > 0 {
// equivalent to len(strings.Split(string(respbody), " ")) and
// len(strings.Split(strings.TrimSpace(string(respbody)), "\n"))
resp.Words = bytes.Count(respbody, []byte{' '}) + 1
resp.Lines = bytes.Count(bytes.TrimSpace(respbody), []byte{'\n'}) + 1
}

if !h.Options.Unsafe && h.Options.TLSGrab {
Expand Down
23 changes: 16 additions & 7 deletions common/httpx/option.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,22 @@ import (
"github.com/projectdiscovery/networkpolicy"
)

// DefaultMaxResponseBodySize is the default maximum response body size
var DefaultMaxResponseBodySize int64

func init() {
maxResponseBodySize, _ := humanize.ParseBytes("512Mb")
DefaultMaxResponseBodySize = int64(maxResponseBodySize)
}
// DefaultMaxResponseBodySize is the default maximum response body size that httpx
// reads into memory for processing (and, via the runner, the default cap for
// responses stored to disk with -sr). It is intentionally bounded: the body is
// held in memory and the footprint scales with the number of concurrent threads,
// so a very large cap can lead to excessive memory usage / OOM on large
// responses. Normal web pages are far smaller than this; use -rstr / -rsts to
// read or store larger responses when needed.
//
// NOTE: this is a var initializer (not an init() function) on purpose. init()
// functions run after all package-level variable initializers, so computing the
// value in init() left DefaultOptions (which references it below) observing a
// zero value during package initialization.
//
// The size literal is fixed at compile time, so a parse failure can only be a
// programming error. It is reported with a panic during package initialization
// instead of being discarded, because a discarded error would silently turn
// the default cap into 0.
var DefaultMaxResponseBodySize = func() int64 {
	maxResponseBodySize, err := humanize.ParseBytes("50mb")
	if err != nil {
		panic("httpx: invalid default max response body size: " + err.Error())
	}
	return int64(maxResponseBodySize)
}()

// Options contains configuration options for the client
type Options struct {
Expand Down
190 changes: 190 additions & 0 deletions common/httpx/response_memory_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
package httpx

import (
"bytes"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"

"github.com/projectdiscovery/retryablehttp-go"
"github.com/stretchr/testify/require"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/transform"
)

// newLocalHTTPX returns an HTTPX client intended for talking to a local test
// server only: CdnCheck is set to "false", retries are disabled and the timeout
// is five seconds. A construction error fails the calling test immediately.
func newLocalHTTPX(t *testing.T) *HTTPX {
	t.Helper()

	// Start from the package defaults and override only what the tests need.
	// NB: relies on DefaultOptions.MaxResponseBodySizeToRead being non-zero
	// (see TestDefaultOptionsHasNonZeroReadSize) so the body is actually read.
	opts := DefaultOptions
	opts.RetryMax = 0
	opts.Timeout = 5 * time.Second
	opts.CdnCheck = "false"

	client, err := New(&opts)
	require.NoError(t, err)

	return client
}

// doLocal sends a GET request for url through ht and returns the resulting
// httpx Response. Failing to build the request or to perform it aborts the
// calling test.
func doLocal(t *testing.T, ht *HTTPX, url string) *Response {
	t.Helper()

	request, buildErr := retryablehttp.NewRequest(http.MethodGet, url, nil)
	require.NoError(t, buildErr)

	response, doErr := ht.Do(request, UnsafeOptions{})
	require.NoError(t, doErr)

	return response
}

// legacyWordsLines computes word and line counts the way the code did before
// the refactor, so tests can check the byte-based path against it.
//
// An empty body yields (0, 0). Otherwise words is the number of pieces obtained
// by splitting the body on single spaces, and lines is the number of pieces
// obtained by splitting the whitespace-trimmed body on "\n".
func legacyWordsLines(body []byte) (words, lines int) {
	text := string(body)
	if text == "" {
		return 0, 0
	}

	bySpace := strings.Split(text, " ")
	byNewline := strings.Split(strings.TrimSpace(text), "\n")

	return len(bySpace), len(byNewline)
}

// TestDefaultOptionsHasNonZeroReadSize guards against the package var-init
// ordering regression where DefaultOptions was initialized before
// DefaultMaxResponseBodySize, leaving MaxResponseBodySizeToRead at 0 (which made
// LimitReader read zero bytes and produced empty bodies for library users).
func TestDefaultOptionsHasNonZeroReadSize(t *testing.T) {
require.NotZero(t, DefaultMaxResponseBodySize)
require.Equal(t, DefaultMaxResponseBodySize, DefaultOptions.MaxResponseBodySizeToRead)
}

func TestDoBodyNoDecodePreservesRawAndData(t *testing.T) {
body := []byte("hello world\nsecond line\n")
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write(body)
}))
defer ts.Close()

resp := doLocal(t, newLocalHTTPX(t), ts.URL)

require.Equal(t, body, resp.Data, "decoded data must equal body")
require.Equal(t, body, resp.RawData, "raw data must equal undecoded body")

wantWords, wantLines := legacyWordsLines(body)
require.Equal(t, wantWords, resp.Words)
require.Equal(t, wantLines, resp.Lines)
}

// TestDoBodyEmpty checks that a response without a body yields empty Data and
// RawData and zero word/line counts.
func TestDoBodyEmpty(t *testing.T) {
	// The handler only sets a header and never writes a body.
	handler := func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "text/html")
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	got := doLocal(t, newLocalHTTPX(t), srv.URL)

	require.Empty(t, got.Data)
	require.Empty(t, got.RawData)
	require.Zero(t, got.Words)
	require.Zero(t, got.Lines)
}

// TestDoBodyGBKDecodeKeepsRawUndecoded covers the path where DecodeData really
// transcodes the body: RawData must keep the original GBK bytes exactly as
// they were sent, while Data must contain the decoded UTF-8 bytes.
func TestDoBodyGBKDecodeKeepsRawUndecoded(t *testing.T) {
	const utf8Body = "<html><head></head><body>你好世界 测试</body></html>"
	wantDecoded := []byte(utf8Body)

	gbkBody, _, err := transform.Bytes(simplifiedchinese.GBK.NewEncoder(), []byte(utf8Body))
	require.NoError(t, err)
	require.NotEqual(t, wantDecoded, gbkBody, "precondition: gbk bytes differ from utf8")

	handler := func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "text/html; charset=gbk")
		_, _ = w.Write(gbkBody)
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	got := doLocal(t, newLocalHTTPX(t), srv.URL)

	require.Equal(t, gbkBody, got.RawData, "RawData must hold the original undecoded bytes")
	require.Equal(t, wantDecoded, got.Data, "Data must hold the decoded UTF-8 bytes")
}

// TestDoBodyNoDecodeSharesBacking pins down the memory optimization: when no
// decoding takes place, RawData and Data must point at the same backing array,
// i.e. no extra full-body copy is made.
func TestDoBodyNoDecodeSharesBacking(t *testing.T) {
	payload := []byte("shared backing array body")

	handler := func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "text/plain")
		_, _ = w.Write(payload)
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	got := doLocal(t, newLocalHTTPX(t), srv.URL)

	// Both slices must be non-empty before their first elements are addressed.
	require.NotEmpty(t, got.Data)
	require.NotEmpty(t, got.RawData)
	require.Equal(t, got.Data, got.RawData)

	firstData, firstRaw := &got.Data[0], &got.RawData[0]
	require.Same(t, firstData, firstRaw,
		"RawData and Data should share the backing array on the no-decode path")
}

// TestWordsLinesEquivalence is the core guard for the refactor: the byte-based
// counting used on the hot path must give exactly the same word and line counts
// as the previous strings.Split-based counting (legacyWordsLines) for a wide
// range of inputs and edge cases.
//
// The byte-based expressions are repeated here verbatim from the hot path; keep
// them in sync if the hot path changes.
func TestWordsLinesEquivalence(t *testing.T) {
	cases := []string{
		"",                          // empty body: both counts stay 0
		"a",                         // single character
		"a b c",                     // simple words
		" ",                         // only spaces
		"a  b",                      // consecutive spaces
		"line1\nline2\nline3",       // multiple lines
		"\n\n\n",                    // only newlines
		" leading and trailing ",    // surrounding whitespace
		"\n mixed \t whitespace \n", // tabs/newlines around
		"trailing newline\n",        // trailing newline is trimmed before counting lines
		"word",                      // single word
		"tab\tseparated values",     // tabs are not word separators
		"unicode \u00a0 nbsp space", // non-breaking space inside the body
		"emoji 😀 and spaces ",       // multi-byte runes
		"line1\r\nline2\r\n",        // CRLF line endings
		"\xff\xfe bad utf8 \n",      // invalid UTF-8 bytes
		"\u2028edge unicode\u2028",  // Unicode whitespace at both edges
	}

	for _, c := range cases {
		body := []byte(c)
		wantWords, wantLines := legacyWordsLines(body)

		var gotWords, gotLines int
		if len(body) > 0 {
			gotWords = bytes.Count(body, []byte{' '}) + 1
			gotLines = bytes.Count(bytes.TrimSpace(body), []byte{'\n'}) + 1
		}

		require.Equalf(t, wantWords, gotWords, "words mismatch for %q", c)
		require.Equalf(t, wantLines, gotLines, "lines mismatch for %q", c)
	}
}

// TestBodyMetricsCountingDoesNotAllocate pins the optimization in place: the
// byte-based word/line counting used on the hot path must perform zero
// allocations (the earlier string(respbody) + strings.Split approach allocated
// O(len(body))). Reintroducing a full-body string copy or Split-based counting
// makes this test fail.
func TestBodyMetricsCountingDoesNotAllocate(t *testing.T) {
	const (
		repeats = 40000 // ~1MB in total
		runs    = 50
	)
	payload := bytes.Repeat([]byte("lorem ipsum dolor sit amet\n"), repeats)

	var wordCount, lineCount int
	countMetrics := func() {
		// identical expressions to the hot path in Do()
		wordCount = bytes.Count(payload, []byte{' '}) + 1
		lineCount = bytes.Count(bytes.TrimSpace(payload), []byte{'\n'}) + 1
	}

	avgAllocs := testing.AllocsPerRun(runs, countMetrics)

	require.NotZero(t, wordCount)
	require.NotZero(t, lineCount)
	require.Zerof(t, avgAllocs, "word/line counting must not allocate, got %v allocs/op", avgAllocs)
}
Loading