Skip to content

Commit 0ef2f65

Browse files
authored
Merge pull request #362 from MITLibraries/use-410-turnstile-gem
Implement bot detection and verification
2 parents 6c44c1d + a01ead6 commit 0ef2f65

14 files changed

Lines changed: 332 additions & 1 deletion

File tree

.env.test

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
ALMA_OPENURL=https://na06.alma.exlibrisgroup.com/view/uresolver/01MIT_INST/openurl?
2+
TURNSTILE_SITEKEY=test-sitekey
3+
TURNSTILE_SECRET=test-secret
24
FEATURE_TIMDEX_FULLTEXT=true
35
FEATURE_GEODATA=false
46
MIT_PRIMO_URL=https://mit.primo.exlibrisgroup.com

Gemfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ git_source(:github) { |repo| "https://github.com/#{repo}.git" }
44
ruby '3.4.8'
55

66
gem 'bootsnap', require: false
7+
gem 'crawler_detect'
78
gem 'graphql'
89
gem 'graphql-client'
910
gem 'http'
@@ -14,6 +15,7 @@ gem 'openssl'
1415
gem 'puma'
1516
gem 'rack-attack'
1617
gem 'rack-timeout'
18+
gem 'rails_cloudflare_turnstile'
1719
gem 'rails', '~> 7.2.0'
1820
gem 'redis'
1921
gem 'scout_apm'

Gemfile.lock

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,8 @@ GEM
120120
bigdecimal
121121
rexml
122122
crass (1.0.6)
123+
crawler_detect (1.2.9)
124+
qonfig (>= 0.24)
123125
date (3.5.1)
124126
debug (1.11.1)
125127
irb (~> 1.10)
@@ -134,6 +136,12 @@ GEM
134136
drb (2.2.3)
135137
erb (5.1.3)
136138
erubi (1.13.1)
139+
faraday (2.14.1)
140+
faraday-net_http (>= 2.0, < 3.5)
141+
json
142+
logger
143+
faraday-net_http (3.4.2)
144+
net-http (~> 0.5)
137145
ffi (1.17.2-aarch64-linux-gnu)
138146
ffi (1.17.2-arm64-darwin)
139147
ffi (1.17.2-x86_64-darwin)
@@ -206,6 +214,8 @@ GEM
206214
mocha (2.8.2)
207215
ruby2_keywords (>= 0.0.5)
208216
msgpack (1.8.0)
217+
net-http (0.9.1)
218+
uri (>= 0.11.1)
209219
net-imap (0.5.13)
210220
date
211221
net-protocol
@@ -243,6 +253,8 @@ GEM
243253
public_suffix (6.0.2)
244254
puma (7.2.0)
245255
nio4r (~> 2.0)
256+
qonfig (0.30.0)
257+
base64 (>= 0.2)
246258
racc (1.8.1)
247259
rack (3.1.20)
248260
rack-attack (6.8.0)
@@ -276,6 +288,9 @@ GEM
276288
rails-html-sanitizer (1.7.0)
277289
loofah (~> 2.25)
278290
nokogiri (>= 1.15.7, != 1.16.7, != 1.16.6, != 1.16.5, != 1.16.4, != 1.16.3, != 1.16.2, != 1.16.1, != 1.16.0.rc1, != 1.16.0)
291+
rails_cloudflare_turnstile (0.5.0)
292+
faraday (>= 1.0, < 3.0)
293+
rails (>= 6.0, < 8.2)
279294
railties (7.2.3)
280295
actionpack (= 7.2.3)
281296
activesupport (= 7.2.3)
@@ -381,6 +396,7 @@ GEM
381396
unicode-display_width (3.2.0)
382397
unicode-emoji (~> 4.1)
383398
unicode-emoji (4.2.0)
399+
uri (1.1.1)
384400
useragent (0.16.11)
385401
vcr (6.4.0)
386402
web-console (4.2.1)
@@ -421,6 +437,7 @@ DEPENDENCIES
421437
bootsnap
422438
capybara
423439
climate_control
440+
crawler_detect
424441
debug
425442
dotenv-rails
426443
graphql
@@ -437,6 +454,7 @@ DEPENDENCIES
437454
rack-attack
438455
rack-timeout
439456
rails (~> 7.2.0)
457+
rails_cloudflare_turnstile
440458
redis
441459
rubocop
442460
rubocop-rails

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ See `Optional Environment Variables` for more information.
9595
- `BOOLEAN_OPTIONS`: comma separated list of values to present to testers on instances where `BOOLEAN_PICKER` feature is enabled.
9696
- `FEATURE_BOOLEAN_PICKER`: feature to allow users to select their preferred boolean type. If set to `true`, feature is enabled. This feature is only intended for internal team
9797
testing and should never be enabled in production (less because it would cause harm than because the UI is a mess).
98+
- `FEATURE_BOT_DETECTION`: When set to `true`, enables bot detection using crawler_detect and Cloudflare Turnstile challenges for suspected bots on search result pages. Requires `TURNSTILE_SITEKEY` and `TURNSTILE_SECRET` to be set. If disabled, bots may crawl search results freely.
9899
- `FEATURE_GEODATA`: Enables features related to geospatial data discovery. Setting this variable to `true` will trigger geodata
99100
mode. Note that this is currently intended _only_ for the geodata app and
100101
may have unexpected consequences if applied to other TIMDEX UI apps.
@@ -146,6 +147,8 @@ instance is sending what search traffic. Defaults to "unset" if not defined.
146147
- `TIMDEX_INDEX`: Name of the index, or alias, to provide to the GraphQL endpoint. Defaults to `nil` which will let TIMDEX determine the best index to use. Wildcard values can be set, for example `rdi*` would search any indexes that begin with `rdi` in the underlying OpenSearch instance behind TIMDEX.
147148
- `TIMDEX_SOURCES`: Comma-separated list of sources to display in the advanced-search source selection element. This
148149
overrides the default which is set in ApplicationHelper.
150+
- `TURNSTILE_SECRET`: The Cloudflare Turnstile secret key used to verify challenge responses. If not set, bot challenge protection is disabled.
151+
- `TURNSTILE_SITEKEY`: The Cloudflare Turnstile site key used to render the challenge widget. If not set, bot challenge protection is disabled.
149152

150153
#### Test Environment-only Variables
151154

app/controllers/search_controller.rb

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ class SearchController < ApplicationController
22
before_action :validate_q!, only: %i[results]
33
before_action :validate_format_token, only: %i[results]
44
before_action :set_active_tab, only: %i[results]
5+
before_action :challenge_bots!, only: %i[results]
56
around_action :sleep_if_too_fast, only: %i[results]
67

78
before_action :validate_geobox_presence!, only: %i[results]
@@ -271,6 +272,15 @@ def validate_q!
271272
redirect_to root_url
272273
end
273274

275+
# Redirect suspected crawlers to Turnstile when the bot_detection feature is enabled.
276+
def challenge_bots!
  # Only visitors who have not already passed Turnstile in this session are candidates,
  # and only while the bot_detection feature flag is on.
  candidate = Feature.enabled?(:bot_detection) && !session[:passed_turnstile]
  return unless candidate && BotDetector.should_challenge?(request)

  # Carry the current URL along so it can be used as the post-verification destination.
  redirect_to turnstile_path(return_to: request.fullpath)
end
283+
274284
def validate_geodistance_presence!
275285
return unless Feature.enabled?(:geodata)
276286

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Presents the Cloudflare Turnstile challenge page and, once a submission is accepted,
# records that fact in the session so the visitor is not challenged again.
class TurnstileController < ApplicationController
  before_action :validate_cloudflare_turnstile, only: :verify

  rescue_from RailsCloudflareTurnstile::Forbidden, with: :handle_forbidden

  # Renders the challenge form. The requested return path is exposed to the view, which
  # round-trips it through a hidden field so `verify` can redirect back afterwards.
  def show
    @return_to = requested_return_to
  end

  # Reached only when the `validate_cloudflare_turnstile` before_action did not halt the
  # request. Marks the session as verified and redirects to a sanitized return path.
  def verify
    session[:passed_turnstile] = true
    redirect_to safe_return_path
  end

  private

  # The raw `return_to` parameter, or root_path when it is missing or blank. This value is
  # only used to pre-fill the form; it is sanitized by `safe_return_path` before any redirect.
  def requested_return_to
    params[:return_to].presence || root_path
  end

  # Handles Turnstile rejecting token submission due to invalid token, network issue, etc.
  def handle_forbidden
    # The :show template reads @return_to for its hidden field. Without this assignment the
    # re-rendered form would submit an empty return_to and a successful retry would land on
    # the root page instead of the page the visitor originally asked for.
    @return_to = requested_return_to
    flash.now[:error] = "We couldn't complete the verification. Please try again."
    render :show, status: :unprocessable_entity
  end

  # Returns a safe path to redirect to after Turnstile verification. Valid paths begin with
  # exactly one slash that is not followed by another slash or a backslash: browsers treat
  # both "//host" and "/\host" as protocol-relative URLs, which would allow an open redirect.
  # Falls back to root_path if the provided path is missing or invalid.
  def safe_return_path
    return_to = params[:return_to].to_s
    return root_path unless return_to.start_with?('/')
    return root_path if return_to.start_with?('//', '/\\')

    return_to
  end
end

app/models/bot_detector.rb

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Decides whether an incoming request looks like a crawler that should be sent to a challenge.
class BotDetector
  # Path prefixes subject to a challenge: /results is the search results page and
  # /record is the full record view.
  CHALLENGED_PATH_PREFIXES = %w[/results /record].freeze

  # Returns true if the request appears to be a bot according to crawler_detect.
  # A missing User-Agent is treated as an empty string. Any error raised while
  # inspecting the request is logged at debug level and reported as "not a bot".
  def self.bot?(request)
    ua = request.user_agent.to_s
    CrawlerDetect.new(ua).is_crawler?
  rescue StandardError => e
    Rails.logger.debug("BotDetector: crawler_detect failed for UA '#{ua}': #{e.message}")
    false
  end

  # Returns true when the request is flagged as a bot AND targets one of the challenged
  # path prefixes (search results or record pages). This keeps the rule simple and
  # conservative: bots hitting any other path are not challenged.
  def self.should_challenge?(request)
    return false unless bot?(request)

    request.path.to_s.start_with?(*CHALLENGED_PATH_PREFIXES)
  end
end

app/models/feature.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
#
3434
class Feature
3535
# List of all valid features in the application
36-
VALID_FEATURES = %i[geodata boolean_picker oa_always simulate_search_latency tab_primo_all tab_timdex_all
36+
VALID_FEATURES = %i[bot_detection geodata boolean_picker oa_always simulate_search_latency tab_primo_all tab_timdex_all
3737
tab_timdex_alma record_link timdex_fulltext].freeze
3838

3939
# Check if a feature is enabled by name

app/views/turnstile/show.html.erb

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
<%= cloudflare_turnstile_script_tag %>
2+
3+
<section class="turnstile-challenge">
4+
<div class="turnstile-challenge__inner">
5+
<h1>Verify you're human</h1>
6+
<p>
7+
Please complete this verification to continue.
8+
</p>
9+
10+
<%= form_with url: turnstile_verify_path, method: :post, local: true do %>
11+
<%= hidden_field_tag :return_to, @return_to %>
12+
13+
<div class="turnstile-widget">
14+
<%= cloudflare_turnstile(action: 'search') %>
15+
</div>
16+
17+
<div class="turnstile-challenge__actions">
18+
<%= submit_tag 'Submit', class: 'btn button-primary' %>
19+
</div>
20+
<% end %>
21+
</div>
22+
</section>
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Explicitly require Feature model to check if bot detection is enabled
2+
require Rails.root.join('app/models/feature')
3+
4+
module TurnstileConfig
  module_function

  # Resets and re-applies the RailsCloudflareTurnstile configuration from the current
  # feature flag and environment variables. Turnstile is never enabled in the test
  # environment; there the gem's mock mode is switched on instead.
  def apply
    RailsCloudflareTurnstile.reset_configuration!

    detection_on = bot_detection_enabled?
    test_env = Rails.env.test?
    turnstile_on = detection_on && !test_env

    RailsCloudflareTurnstile.configure do |turnstile|
      turnstile.site_key = ENV['TURNSTILE_SITEKEY']
      turnstile.secret_key = ENV['TURNSTILE_SECRET']
      turnstile.enabled = turnstile_on
      # fail_open is always the inverse of enabled.
      turnstile.fail_open = !turnstile_on
      turnstile.mock_enabled = test_env
    end
  end

  # True only when the bot_detection feature is on and both Turnstile credentials are
  # present. When the feature is on but a credential is missing, the misconfiguration is
  # logged and reported to Sentry, and the result is false.
  def bot_detection_enabled?
    return false unless Feature.enabled?(:bot_detection)

    # Check that required env is present
    credentials = %w[TURNSTILE_SITEKEY TURNSTILE_SECRET].map { |name| ENV.fetch(name, nil) }
    return true if credentials.none?(&:blank?)

    Rails.logger.error('Bot detection enabled but missing TURNSTILE_SITEKEY or TURNSTILE_SECRET')
    Sentry.capture_message('Bot detection misconfigured: missing Turnstile credentials', level: :error)
    false
  end
end
37+
38+
TurnstileConfig.apply

0 commit comments

Comments
 (0)