merge

jxmorris12 · jxmorris12 · commit ca6f2733e403 · 2022-11-01T11:28:48.000-04:00
diff --git a/textattack/datasets/helpers/ted_multi.py b/textattack/datasets/helpers/ted_multi.py
@@ -20,7 +20,7 @@ class TedMultiTranslationDataset(HuggingFaceDataset):
     dataset source: http://www.cs.jhu.edu/~kevinduh/a/multitarget-tedtalks/
     """
 
-    def __init__(self, source_lang="en", target_lang="de", split="test"):
+    def __init__(self, source_lang="en", target_lang="de", split="test", shuffle=False):
         self._dataset = datasets.load_dataset("ted_multi")[split]
         self.examples = self._dataset["translations"]
         language_options = set(self.examples[0]["language"])
@@ -34,6 +34,9 @@ def __init__(self, source_lang="en", target_lang="de", split="test"):
             )
         self.source_lang = source_lang
         self.target_lang = target_lang
+        self.shuffled = shuffle
+        if shuffle:
+            self._dataset.shuffle()
 
     def _format_raw_example(self, raw_example):
         translations = np.array(raw_example["translation"])
diff --git a/textattack/shared/attacked_text.py b/textattack/shared/attacked_text.py
@@ -82,7 +82,11 @@ def __eq__(self, other: AttackedText) -> bool:
         and it's actually much faster (cache-wise) to just compare
         by the text, and this works for lots of use cases.
         """
-        return self.text == other.text
+        if not (self.text == other.text):
+            return False
+        if len(self.attack_attrs) != len(other.attack_attrs):
+            return False
+        return True
 
     def __hash__(self) -> int:
         return hash(self.text)
@@ -466,9 +470,6 @@ def generate_new_attacked_text(self, new_words: Iterable[str]) -> AttackedText:
             perturbed_text += adv_word_seq
         perturbed_text += original_text  # Add all of the ending punctuation.
 
-        # Add pointer to self so chain of replacements can be reconstructed.
-        new_attack_attrs["prev_attacked_text"] = self
-
         # Reform perturbed_text into an OrderedDict.
         perturbed_input_texts = perturbed_text.split(AttackedText.SPLIT_TOKEN)
         perturbed_input = OrderedDict(
@@ -570,7 +571,10 @@ def num_words(self) -> int:
 
     @property
     def newly_swapped_words(self) -> List[str]:
-        return [self.words[i] for i in self.attack_attrs["newly_modified_indices"]]
+        return [
+            self.attack_attrs["prev_attacked_text"].words[i]
+            for i in self.attack_attrs["newly_modified_indices"]
+        ]
 
     def printable_text(self, key_color="bold", key_color_method=None) -> str:
         """Represents full text input. Adds field descriptions.
diff --git a/textattack/shared/utils/strings.py b/textattack/shared/utils/strings.py
@@ -2,8 +2,6 @@
 import string
 
 import flair
-import jieba
-import pycld2 as cld2
 
 from .importing import LazyLoader
 
@@ -32,15 +30,6 @@ def add_indent(s_, numSpaces):
 def words_from_text(s, words_to_ignore=[]):
     """Lowercases a string, removes all non-alphanumeric characters, and splits
     into words."""
-    # try:
-    #     isReliable, textBytesFound, details = cld2.detect(s)
-    #     if details[0][0] == "Chinese" or details[0][0] == "ChineseT":
-    #         seg_list = jieba.cut(s, cut_all=False)
-    #         s = " ".join(seg_list)
-    #     else:
-    #         s = " ".join(s.split())
-    # except Exception:
-    #     s = " ".join(s.split())
     s = " ".join(s.split())
 
     homos = """˗৭Ȣ𝟕бƼᏎƷᒿlO`ɑЬϲԁе𝚏ɡհіϳ𝒌ⅼｍոорԛⲅѕ𝚝սѵԝ×уᴢ"""