From 115ab2869b2742f0cc68116a8c03359d220fd608 Mon Sep 17 00:00:00 2001 From: Partho Ghosh Date: Tue, 3 Jan 2023 17:12:48 -0800 Subject: [PATCH] =?UTF-8?q?Fix=20=E3=83=BB=20detection=20in=20hashtag=20re?= =?UTF-8?q?gex=20to=20construct=20hashtag=20correctly=20(#22888)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix ・ detection in hashtag regex to construct hashtag correctly * Fixed rubocop liniting issues * More rubocop linting fix --- app/models/tag.rb | 22 +++++++++++---- spec/models/tag_spec.rb | 59 +++++++++++++++++++++++------------------ 2 files changed, 50 insertions(+), 31 deletions(-) diff --git a/app/models/tag.rb b/app/models/tag.rb index b66f85423..47a05d00a 100644 --- a/app/models/tag.rb +++ b/app/models/tag.rb @@ -26,8 +26,12 @@ class Tag < ApplicationRecord has_many :featured_tags, dependent: :destroy, inverse_of: :tag has_many :followers, through: :passive_relationships, source: :account - HASHTAG_SEPARATORS = "_\u00B7\u200c" - HASHTAG_NAME_PAT = "([[:word:]_][[:word:]#{HASHTAG_SEPARATORS}]*[[:alpha:]#{HASHTAG_SEPARATORS}][[:word:]#{HASHTAG_SEPARATORS}]*[[:word:]_])|([[:word:]_]*[[:alpha:]][[:word:]_]*)" + HASHTAG_SEPARATORS = "_\u00B7\u30FB\u200c" + HASHTAG_FIRST_SEQUENCE_CHUNK_ONE = "[[:word:]_][[:word:]#{HASHTAG_SEPARATORS}]*[[:alpha:]#{HASHTAG_SEPARATORS}]" + HASHTAG_FIRST_SEQUENCE_CHUNK_TWO = "[[:word:]#{HASHTAG_SEPARATORS}]*[[:word:]_]" + HASHTAG_FIRST_SEQUENCE = "(#{HASHTAG_FIRST_SEQUENCE_CHUNK_ONE}#{HASHTAG_FIRST_SEQUENCE_CHUNK_TWO})" + HASTAG_LAST_SEQUENCE = '([[:word:]_]*[[:alpha:]][[:word:]_]*)' + HASHTAG_NAME_PAT = "#{HASHTAG_FIRST_SEQUENCE}|#{HASTAG_LAST_SEQUENCE}" HASHTAG_RE = /(?:^|[^\/\)\w])#(#{HASHTAG_NAME_PAT})/i HASHTAG_NAME_RE = /\A(#{HASHTAG_NAME_PAT})\z/i @@ -45,7 +49,11 @@ class Tag < ApplicationRecord scope :listable, -> { where(listable: [true, nil]) } scope :trendable, -> { Setting.trendable_by_default ? where(trendable: [true, nil]) : where(trendable: true) } scope :not_trendable, -> { where(trendable: false) } - scope :recently_used, ->(account) { joins(:statuses).where(statuses: { id: account.statuses.select(:id).limit(1000) }).group(:id).order(Arel.sql('count(*) desc')) } + scope :recently_used, ->(account) { + joins(:statuses) + .where(statuses: { id: account.statuses.select(:id).limit(1000) }) + .group(:id).order(Arel.sql('count(*) desc')) + } scope :matches_name, ->(term) { where(arel_table[:name].lower.matches(arel_table.lower("#{sanitize_sql_like(Tag.normalize(term))}%"), nil, true)) } # Search with case-sensitive to use B-tree index update_index('tags', :self) @@ -105,7 +113,8 @@ class Tag < ApplicationRecord names = Array(name_or_names).map { |str| [normalize(str), str] }.uniq(&:first) names.map do |(normalized_name, display_name)| - tag = matching_name(normalized_name).first || create(name: normalized_name, display_name: display_name.gsub(HASHTAG_INVALID_CHARS_RE, '')) + tag = matching_name(normalized_name).first || create(name: normalized_name, + display_name: display_name.gsub(HASHTAG_INVALID_CHARS_RE, '')) yield tag if block_given? @@ -154,6 +163,9 @@ class Tag < ApplicationRecord end def validate_display_name_change - errors.add(:display_name, I18n.t('tags.does_not_match_previous_name')) unless HashtagNormalizer.new.normalize(display_name).casecmp(name.mb_chars).zero? + unless HashtagNormalizer.new.normalize(display_name).casecmp(name.mb_chars).zero? + errors.add(:display_name, + I18n.t('tags.does_not_match_previous_name')) + end end end diff --git a/spec/models/tag_spec.rb b/spec/models/tag_spec.rb index b16f99a79..102d2f625 100644 --- a/spec/models/tag_spec.rb +++ b/spec/models/tag_spec.rb @@ -1,21 +1,22 @@ +# frozen_string_literal: true require 'rails_helper' -RSpec.describe Tag, type: :model do +RSpec.describe Tag do describe 'validations' do it 'invalid with #' do - expect(Tag.new(name: '#hello_world')).to_not be_valid + expect(described_class.new(name: '#hello_world')).not_to be_valid end it 'invalid with .' do - expect(Tag.new(name: '.abcdef123')).to_not be_valid + expect(described_class.new(name: '.abcdef123')).not_to be_valid end it 'invalid with spaces' do - expect(Tag.new(name: 'hello world')).to_not be_valid + expect(described_class.new(name: 'hello world')).not_to be_valid end it 'valid with aesthetic' do - expect(Tag.new(name: 'aesthetic')).to be_valid + expect(described_class.new(name: 'aesthetic')).to be_valid end end @@ -62,6 +63,10 @@ RSpec.describe Tag, type: :model do expect(subject.match('hello #one·two·three').to_s).to eq ' #one·two·three' end + it 'matches ・unicode in ぼっち・ざ・ろっく correctly' do + expect(subject.match('testing #ぼっち・ざ・ろっく').to_s).to eq ' #ぼっち・ざ・ろっく' + end + it 'matches ZWNJ' do expect(subject.match('just add #نرم‌افزار and').to_s).to eq ' #نرم‌افزار' end @@ -89,44 +94,46 @@ RSpec.describe Tag, type: :model do describe '.find_normalized' do it 'returns tag for a multibyte case-insensitive name' do upcase_string = 'abcABCabcABCやゆよ' - downcase_string = 'abcabcabcabcやゆよ'; + downcase_string = 'abcabcabcabcやゆよ' tag = Fabricate(:tag, name: HashtagNormalizer.new.normalize(downcase_string)) - expect(Tag.find_normalized(upcase_string)).to eq tag + expect(described_class.find_normalized(upcase_string)).to eq tag end end describe '.matches_name' do it 'returns tags for multibyte case-insensitive names' do upcase_string = 'abcABCabcABCやゆよ' - downcase_string = 'abcabcabcabcやゆよ'; + downcase_string = 'abcabcabcabcやゆよ' tag = Fabricate(:tag, name: HashtagNormalizer.new.normalize(downcase_string)) - expect(Tag.matches_name(upcase_string)).to eq [tag] + expect(described_class.matches_name(upcase_string)).to eq [tag] end it 'uses the LIKE operator' do - expect(Tag.matches_name('100%abc').to_sql).to eq %q[SELECT "tags".* FROM "tags" WHERE LOWER("tags"."name") LIKE LOWER('100abc%')] + result = %q[SELECT "tags".* FROM "tags" WHERE LOWER("tags"."name") LIKE LOWER('100abc%')] + expect(described_class.matches_name('100%abc').to_sql).to eq result end end describe '.matching_name' do it 'returns tags for multibyte case-insensitive names' do upcase_string = 'abcABCabcABCやゆよ' - downcase_string = 'abcabcabcabcやゆよ'; + downcase_string = 'abcabcabcabcやゆよ' tag = Fabricate(:tag, name: HashtagNormalizer.new.normalize(downcase_string)) - expect(Tag.matching_name(upcase_string)).to eq [tag] + expect(described_class.matching_name(upcase_string)).to eq [tag] end end describe '.find_or_create_by_names' do - it 'runs a passed block once per tag regardless of duplicates' do - upcase_string = 'abcABCabcABCやゆよ' - downcase_string = 'abcabcabcabcやゆよ'; - count = 0 + let(:upcase_string) { 'abcABCabcABCやゆよ' } + let(:downcase_string) { 'abcabcabcabcやゆよ' } - Tag.find_or_create_by_names([upcase_string, downcase_string]) do |tag| + it 'runs a passed block once per tag regardless of duplicates' do + count = 0 + + described_class.find_or_create_by_names([upcase_string, downcase_string]) do |_tag| count += 1 end @@ -136,28 +143,28 @@ RSpec.describe Tag, type: :model do describe '.search_for' do it 'finds tag records with matching names' do - tag = Fabricate(:tag, name: "match") - _miss_tag = Fabricate(:tag, name: "miss") + tag = Fabricate(:tag, name: 'match') + _miss_tag = Fabricate(:tag, name: 'miss') - results = Tag.search_for("match") + results = described_class.search_for('match') expect(results).to eq [tag] end it 'finds tag records in case insensitive' do - tag = Fabricate(:tag, name: "MATCH") - _miss_tag = Fabricate(:tag, name: "miss") + tag = Fabricate(:tag, name: 'MATCH') + _miss_tag = Fabricate(:tag, name: 'miss') - results = Tag.search_for("match") + results = described_class.search_for('match') expect(results).to eq [tag] end it 'finds the exact matching tag as the first item' do - similar_tag = Fabricate(:tag, name: "matchlater", reviewed_at: Time.now.utc) - tag = Fabricate(:tag, name: "match", reviewed_at: Time.now.utc) + similar_tag = Fabricate(:tag, name: 'matchlater', reviewed_at: Time.now.utc) + tag = Fabricate(:tag, name: 'match', reviewed_at: Time.now.utc) - results = Tag.search_for("match") + results = described_class.search_for('match') expect(results).to eq [tag, similar_tag] end