Instead of detecting spam on first duplicate message, add a threshold of 5 such messages to reduce false positives
		
			
				
	
	
		
			203 lines
		
	
	
	
		
			5.3 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
			
		
		
	
	
			203 lines
		
	
	
	
		
			5.3 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
| # frozen_string_literal: true
 | |
| 
 | |
| class SpamCheck
 | |
|   include Redisable
 | |
|   include ActionView::Helpers::TextHelper
 | |
| 
 | |
|   # Threshold over which two Nilsimsa values are considered
 | |
|   # to refer to the same text
 | |
|   NILSIMSA_COMPARE_THRESHOLD = 95
 | |
| 
 | |
|   # Nilsimsa doesn't work well on small inputs, so below
 | |
|   # this size, we check only for exact matches with MD5
 | |
|   NILSIMSA_MIN_SIZE = 10
 | |
| 
 | |
|   # How long to keep the trail of digests between updates,
 | |
|   # there is no reason to store it forever
 | |
|   EXPIRE_SET_AFTER = 1.week.seconds
 | |
| 
 | |
|   # How many digests to keep in an account's trail. If it's
 | |
|   # too small, spam could rotate around different message templates
 | |
|   MAX_TRAIL_SIZE = 10
 | |
| 
 | |
|   # How many detected duplicates to allow through before
 | |
|   # considering the message as spam
 | |
|   THRESHOLD = 5
 | |
| 
 | |
|   def initialize(status)
 | |
|     @account = status.account
 | |
|     @status  = status
 | |
|   end
 | |
| 
 | |
|   def skip?
 | |
|     disabled? || already_flagged? || trusted? || no_unsolicited_mentions? || solicited_reply?
 | |
|   end
 | |
| 
 | |
|   def spam?
 | |
|     if insufficient_data?
 | |
|       false
 | |
|     elsif nilsimsa?
 | |
|       digests_over_threshold?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD }
 | |
|     else
 | |
|       digests_over_threshold?('md5') { |_, other_digest| other_digest == digest }
 | |
|     end
 | |
|   end
 | |
| 
 | |
|   def flag!
 | |
|     auto_silence_account!
 | |
|     auto_report_status!
 | |
|   end
 | |
| 
 | |
|   def remember!
 | |
|     # The scores in sorted sets don't actually have enough bits to hold an exact
 | |
|     # value of our snowflake IDs, so we use it only for its ordering property. To
 | |
|     # get the correct status ID back, we have to save it in the string value
 | |
| 
 | |
|     redis.zadd(redis_key, @status.id, digest_with_algorithm)
 | |
|     redis.zremrangebyrank(redis_key, 0, -(MAX_TRAIL_SIZE + 1))
 | |
|     redis.expire(redis_key, EXPIRE_SET_AFTER)
 | |
|   end
 | |
| 
 | |
|   def reset!
 | |
|     redis.del(redis_key)
 | |
|   end
 | |
| 
 | |
|   def hashable_text
 | |
|     return @hashable_text if defined?(@hashable_text)
 | |
| 
 | |
|     @hashable_text = @status.text
 | |
|     @hashable_text = remove_mentions(@hashable_text)
 | |
|     @hashable_text = strip_tags(@hashable_text) unless @status.local?
 | |
|     @hashable_text = normalize_unicode(@status.spoiler_text + ' ' + @hashable_text)
 | |
|     @hashable_text = remove_whitespace(@hashable_text)
 | |
|   end
 | |
| 
 | |
|   def insufficient_data?
 | |
|     hashable_text.blank?
 | |
|   end
 | |
| 
 | |
|   def digest
 | |
|     @digest ||= begin
 | |
|       if nilsimsa?
 | |
|         Nilsimsa.new(hashable_text).hexdigest
 | |
|       else
 | |
|         Digest::MD5.hexdigest(hashable_text)
 | |
|       end
 | |
|     end
 | |
|   end
 | |
| 
 | |
|   def digest_with_algorithm
 | |
|     if nilsimsa?
 | |
|       ['nilsimsa', digest, @status.id].join(':')
 | |
|     else
 | |
|       ['md5', digest, @status.id].join(':')
 | |
|     end
 | |
|   end
 | |
| 
 | |
|   class << self
 | |
|     def perform(status)
 | |
|       spam_check = new(status)
 | |
| 
 | |
|       return if spam_check.skip?
 | |
| 
 | |
|       if spam_check.spam?
 | |
|         spam_check.flag!
 | |
|       else
 | |
|         spam_check.remember!
 | |
|       end
 | |
|     end
 | |
|   end
 | |
| 
 | |
|   private
 | |
| 
 | |
|   def disabled?
 | |
|     !Setting.spam_check_enabled
 | |
|   end
 | |
| 
 | |
|   def remove_mentions(text)
 | |
|     return text.gsub(Account::MENTION_RE, '') if @status.local?
 | |
| 
 | |
|     Nokogiri::HTML.fragment(text).tap do |html|
 | |
|       mentions = @status.mentions.map { |mention| ActivityPub::TagManager.instance.url_for(mention.account) }
 | |
| 
 | |
|       html.traverse do |element|
 | |
|         element.unlink if element.name == 'a' && mentions.include?(element['href'])
 | |
|       end
 | |
|     end.to_s
 | |
|   end
 | |
| 
 | |
|   def normalize_unicode(text)
 | |
|     text.unicode_normalize(:nfkc).downcase
 | |
|   end
 | |
| 
 | |
|   def remove_whitespace(text)
 | |
|     text.gsub(/\s+/, ' ').strip
 | |
|   end
 | |
| 
 | |
|   def auto_silence_account!
 | |
|     @account.silence!
 | |
|   end
 | |
| 
 | |
|   def auto_report_status!
 | |
|     status_ids = Status.where(visibility: %i(public unlisted)).where(id: matching_status_ids).pluck(:id) + [@status.id] if @status.distributable?
 | |
|     ReportService.new.call(Account.representative, @account, status_ids: status_ids, comment: I18n.t('spam_check.spam_detected_and_silenced'))
 | |
|   end
 | |
| 
 | |
|   def already_flagged?
 | |
|     @account.silenced?
 | |
|   end
 | |
| 
 | |
|   def trusted?
 | |
|     @account.trust_level > Account::TRUST_LEVELS[:untrusted]
 | |
|   end
 | |
| 
 | |
|   def no_unsolicited_mentions?
 | |
|     @status.mentions.all? { |mention| mention.silent? || (!@account.local? && !mention.account.local?) || mention.account.following?(@account) }
 | |
|   end
 | |
| 
 | |
|   def solicited_reply?
 | |
|     !@status.thread.nil? && @status.thread.mentions.where(account: @account).exists?
 | |
|   end
 | |
| 
 | |
|   def nilsimsa_compare_value(first, second)
 | |
|     first  = [first].pack('H*')
 | |
|     second = [second].pack('H*')
 | |
|     bits   = 0
 | |
| 
 | |
|     0.upto(31) do |i|
 | |
|       bits += Nilsimsa::POPC[255 & (first[i].ord ^ second[i].ord)].ord
 | |
|     end
 | |
| 
 | |
|     128 - bits # -128 <= Nilsimsa Compare Value <= 128
 | |
|   end
 | |
| 
 | |
|   def nilsimsa?
 | |
|     hashable_text.size > NILSIMSA_MIN_SIZE
 | |
|   end
 | |
| 
 | |
|   def other_digests
 | |
|     redis.zrange(redis_key, 0, -1)
 | |
|   end
 | |
| 
 | |
|   def digests_over_threshold?(filter_algorithm)
 | |
|     other_digests.select do |record|
 | |
|       algorithm, other_digest, status_id = record.split(':')
 | |
| 
 | |
|       next unless algorithm == filter_algorithm
 | |
| 
 | |
|       yield algorithm, other_digest, status_id
 | |
|     end.size >= THRESHOLD
 | |
|   end
 | |
| 
 | |
|   def matching_status_ids
 | |
|     if nilsimsa?
 | |
|       other_digests.select { |record| record.start_with?('nilsimsa') && nilsimsa_compare_value(digest, record.split(':')[1]) >= NILSIMSA_COMPARE_THRESHOLD }.map { |record| record.split(':')[2] }.compact
 | |
|     else
 | |
|       other_digests.select { |record| record.start_with?('md5') && record.split(':')[1] == digest }.map { |record| record.split(':')[2] }.compact
 | |
|     end
 | |
|   end
 | |
| 
 | |
|   def redis_key
 | |
|     @redis_key ||= "spam_check:#{@account.id}"
 | |
|   end
 | |
| end
 |