Skip to content

[ENG-5862] SPAM - Fix Wiki Spamming #11171

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: feature/pbs-25-10
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 55 additions & 28 deletions osf/external/spam/tasks.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import re
import logging
import requests
from framework import sentry
from framework.celery_tasks import app as celery_app
from framework.postcommit_tasks.handlers import run_postcommit
from django.contrib.contenttypes.models import ContentType
from django.db import transaction
from osf.external.askismet.client import AkismetClient
from osf.external.oopspam.client import OOPSpamClient
from osf.utils.fields import ensure_str
from website import settings

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -38,11 +40,8 @@ def reclassify_domain_references(notable_domain_id, current_note, previous_note)
item.referrer.save()


def _check_resource_for_domains(guid, content):
from osf.models import Guid, NotableDomain, DomainReference
resource, _ = Guid.load_referent(guid)
if not resource:
return f'{guid} not found'
def _check_resource_for_domains(resource, content):
from osf.models import NotableDomain, DomainReference

spammy_domains = []
referrer_content_type = ContentType.objects.get_for_model(resource)
Expand All @@ -51,7 +50,7 @@ def _check_resource_for_domains(guid, content):
domain=domain,
defaults={'note': note}
)
if notable_domain.note == NotableDomain.Note.EXCLUDE_FROM_ACCOUNT_CREATION_AND_CONTENT:
if notable_domain.note == NotableDomain.Note.EXCLUDE_FROM_ACCOUNT_CREATION_AND_CONTENT.value:
spammy_domains.append(notable_domain.domain)
DomainReference.objects.get_or_create(
domain=notable_domain,
Expand All @@ -61,19 +60,8 @@ def _check_resource_for_domains(guid, content):
'is_triaged': notable_domain.note not in (NotableDomain.Note.UNKNOWN, NotableDomain.Note.UNVERIFIED)
}
)
if spammy_domains:
resource.confirm_spam(save=True, domains=list(spammy_domains))


@run_postcommit(once_per_request=False, celery=True)
@celery_app.task(ignore_results=False, max_retries=5, default_retry_delay=60)
def check_resource_for_domains_postcommit(guid, content):
_check_resource_for_domains(guid, content)


@celery_app.task(ignore_results=False, max_retries=5, default_retry_delay=60)
def check_resource_for_domains_async(guid, content):
_check_resource_for_domains(guid, content)
return spammy_domains


def _extract_domains(content):
Expand Down Expand Up @@ -111,16 +99,11 @@ def _extract_domains(content):
yield domain, note


@run_postcommit(once_per_request=False, celery=True)
@celery_app.task(ignore_results=False, max_retries=5, default_retry_delay=60)
def check_resource_with_spam_services(guid, content, author, author_email, request_kwargs):
def check_resource_with_spam_services(resource, content, author, author_email, request_kwargs):
"""
Return statements used only for debugging and recording keeping
"""
any_is_spam = False
from osf.models import Guid, OSFUser
guid = Guid.load(guid)
resource = guid.referent

kwargs = dict(
user_ip=request_kwargs.get('remote_addr'),
Expand Down Expand Up @@ -163,10 +146,54 @@ def check_resource_with_spam_services(guid, content, author, author_email, reque
resource.spam_data['author_email'] = author_email
resource.flag_spam()

if hasattr(resource, 'check_spam_user'):
user = OSFUser.objects.get(username=author_email)
resource.check_spam_user(user)
return any_is_spam


@run_postcommit(once_per_request=False, celery=True)
@celery_app.task(ignore_results=False, max_retries=5, default_retry_delay=60)
def check_resource_for_spam_postcommit(guid, content, author, author_email, request_headers):
from osf.models import Guid, OSFUser

resource, _ = Guid.load_referent(guid)
if not resource:
return f'{guid} not found'

spammy_domains = _check_resource_for_domains(resource, content)
if spammy_domains:
sentry.log_message(f"Spammy domains detected for {guid}: {spammy_domains}")
resource.confirm_spam(save=False, domains=list(spammy_domains))
elif settings.SPAM_SERVICES_ENABLED:
request_kwargs = {
'remote_addr': request_headers.get('Remote-Addr') or request_headers.get('Host'), # for local testing
'user_agent': request_headers.get('User-Agent'),
'referer': request_headers.get('Referer'),
}
for key, value in request_kwargs.items():
request_kwargs[key] = ensure_str(value)

check_resource_with_spam_services(
resource,
content,
author,
author_email,
request_kwargs,
)

resource.save()

return f'{resource} is spam: {any_is_spam} {resource.spam_data.get("content")}'
if hasattr(resource, 'check_spam_user'):
user = OSFUser.objects.get(username=author_email)
resource.check_spam_user(user)


@celery_app.task(ignore_results=False, max_retries=5, default_retry_delay=60)
def check_resource_for_domains_async(guid, content):
from osf.models import Guid

resource, _ = Guid.load_referent(guid)
if not resource:
return f'{guid} not found'

spammy_domains = _check_resource_for_domains(resource, content)
if spammy_domains:
resource.confirm_spam(save=True, domains=list(spammy_domains))
28 changes: 7 additions & 21 deletions osf/models/spam.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,9 @@

from osf.exceptions import ValidationValueError, ValidationTypeError
from osf.external.askismet import tasks as akismet_tasks
from osf.external.spam.tasks import check_resource_for_domains_postcommit, check_resource_with_spam_services
from osf.external.spam.tasks import check_resource_for_spam_postcommit
from osf.utils.datetime_aware_jsonfield import DateTimeAwareJSONField
from osf.utils.fields import ensure_str, NonNaiveDateTimeField
from website import settings
from osf.utils.fields import NonNaiveDateTimeField

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -197,28 +196,15 @@ def do_check_spam(self, author, author_email, content, request_headers):
if self.is_spammy:
return

request_kwargs = {
'remote_addr': request_headers.get('Remote-Addr') or request_headers.get('Host'), # for local testing
'user_agent': request_headers.get('User-Agent'),
'referer': request_headers.get('Referer'),
}
if isinstance(self, Preprint):
guid__id = self._id
else:
guid__id = self.guids.first()._id
check_resource_for_domains_postcommit(

check_resource_for_spam_postcommit(
guid__id,
content,
author,
author_email,
request_headers,
)

if settings.SPAM_SERVICES_ENABLED:
for key, value in request_kwargs.items():
request_kwargs[key] = ensure_str(value)

check_resource_with_spam_services(
guid__id,
content,
author,
author_email,
request_kwargs,
)
135 changes: 135 additions & 0 deletions osf_tests/test_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
from .factories import get_default_metaschema
from addons.wiki.tests.factories import WikiVersionFactory, WikiFactory
from osf_tests.utils import capture_signals, assert_datetime_equal, mock_archive
from osf.models.spam import SpamStatus
from osf.external.spam import tasks as spam_tasks

pytestmark = pytest.mark.django_db

Expand Down Expand Up @@ -2335,6 +2337,38 @@ def test_check_spam_only_public_node_by_default(self, project, user):
project.check_spam(user, None, None)
assert not project.is_public

@mock.patch('osf.models.node.get_request_and_user_id')
def test_do_check_spam_called_on_set_public(self, mock_get_request, project, user):
mock_request = {
'headers': {
'Remote-Addr': '1.2.3.4',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
'Referer': 'https://osf.io'
}
}
mock_get_request.return_value = (mock_request, user._id)

project.title = 'Spam'
project.description = 'spammy content'
project.is_public = False
project.save()

wiki = WikiFactory(node=project, user=user)
WikiVersionFactory(wiki_page=wiki, content='Some wiki content')

with mock.patch.object(Node, 'do_check_spam') as mock_do_check_spam:
mock_do_check_spam.return_value = False
project.set_privacy('public', auth=Auth(user))

mock_do_check_spam.assert_called_once()
args = mock_do_check_spam.call_args[0]
assert args[0] == user.fullname # author
assert args[1] == user.username # author email
# content
assert 'Some wiki content' in args[2]
assert project.title in args[2]
assert project.description in args[2]

@mock.patch.object(settings, 'SPAM_SERVICES_ENABLED', True)
def test_check_spam_skips_ham_user(self, project, user):
with mock.patch('osf.models.AbstractNode._get_spam_content', mock.Mock(return_value='some content!')):
Expand Down Expand Up @@ -2464,6 +2498,107 @@ def test_multiple_privacy_changing(self, project):
project.confirm_ham()
assert not project.is_public

def test_get_spam_content_includes_wiki_content(self, project, user):
wiki = WikiFactory(node=project, user=user)
WikiVersionFactory(wiki_page=wiki, content='Some wiki content')

content = project._get_spam_content()
assert 'Some wiki content' in content

project.title = 'Test Title'
project.save()
content = project._get_spam_content()
assert 'Test Title' in content
assert 'Some wiki content' in content


class TestCheckResourceForSpamPostcommit:

@pytest.fixture()
def user(self):
return UserFactory()

@pytest.fixture()
def project(self, user):
return ProjectFactory(creator=user)

@pytest.fixture()
def request_headers(self):
return {
'Remote-Addr': '1.2.3.4',
'User-Agent': 'Mozilla/5.0',
'Referer': 'https://osf.io'
}

@mock.patch.object(settings, 'SPAM_SERVICES_ENABLED', True)
@mock.patch('osf.external.spam.tasks._check_resource_for_domains')
def test_check_resource_for_spam_postcommit_with_spammy_domains(self, mock_check_domains, project, user):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

THe ticket instructions to reproduce the bug are:

  1. Create project
  2. Update wiki with spammy domain
  3. Make project public

These are the steps the tests should follow, you are testing behavior that is too specific we know checking for spam domans works, when a project is saved when public, but we need to check a project's wiki content too, not just it's desciption etc.

mock_check_domains.return_value = ['spam.com']
with mock.patch('osf.external.spam.tasks.check_resource_with_spam_services') as mock_check_services:
spam_tasks.check_resource_for_spam_postcommit(
guid=project._id,
content='Check me for spam at spam.com',
author=user.fullname,
author_email=user.username,
request_headers={}
)
project.reload()
assert project.spam_status == SpamStatus.SPAM
assert project.spam_data['domains'] == ['spam.com']
mock_check_services.assert_not_called()

@mock.patch.object(settings, 'SPAM_SERVICES_ENABLED', True)
@mock.patch('osf.external.spam.tasks._check_resource_for_domains')
def test_check_resource_for_spam_postcommit_no_spammy_domains_checks_services(self, mock_check_domains, project, user, request_headers):
mock_check_domains.return_value = []
with mock.patch('osf.external.spam.tasks.check_resource_with_spam_services') as mock_check_services:
mock_check_services.return_value = True
spam_tasks.check_resource_for_spam_postcommit(
guid=project._id,
content='Check me for spam',
author=user.fullname,
author_email=user.username,
request_headers=request_headers
)
mock_check_services.assert_called_once_with(
project,
'Check me for spam',
user.fullname,
user.username,
{
'remote_addr': '1.2.3.4',
'user_agent': 'Mozilla/5.0',
'referer': 'https://osf.io'
}
)

@mock.patch('osf.external.spam.tasks._check_resource_for_domains')
@mock.patch.object(settings, 'SPAM_SERVICES_ENABLED', False)
def test_check_resource_for_spam_postcommit_no_spammy_domains_services_disabled(self, mock_check_domains, project, user):
mock_check_domains.return_value = []
with mock.patch('osf.external.spam.tasks.check_resource_with_spam_services') as mock_check_services:
spam_tasks.check_resource_for_spam_postcommit(
guid=project._id,
content='Check me for spam',
author=user.fullname,
author_email=user.username,
request_headers={}
)
mock_check_services.assert_not_called()

@mock.patch('osf.external.spam.tasks._check_resource_for_domains')
def test_check_resource_for_spam_postcommit_checks_user(self, mock_check_domains, project, user, request_headers):
mock_check_domains.return_value = []
with mock.patch.object(Node, 'check_spam_user') as mock_check_user:
spam_tasks.check_resource_for_spam_postcommit(
guid=project._id,
content='Check me for spam',
author=user.fullname,
author_email=user.username,
request_headers=request_headers
)
mock_check_user.assert_called_once_with(user)


# copied from tests/test_models.py
class TestPrivateLinks:
Expand Down
Loading
Loading