diff --git a/api/base/elasticsearch_dsl_views.py b/api/base/elasticsearch_dsl_views.py index ecf2825d4e8..a8f4292c33e 100644 --- a/api/base/elasticsearch_dsl_views.py +++ b/api/base/elasticsearch_dsl_views.py @@ -3,8 +3,9 @@ import datetime import typing -import elasticsearch6_dsl as edsl +import elasticsearch8.dsl as esdsl from rest_framework import generics, exceptions as drf_exceptions +from rest_framework.serializers import Serializer from rest_framework.settings import api_settings as drf_settings from api.base.settings.defaults import REPORT_FILENAME_FORMAT @@ -23,7 +24,7 @@ class ElasticsearchListView(FilterMixin, JSONAPIBaseView, generics.ListAPIView, abc.ABC): - '''abstract view class using `elasticsearch6_dsl.Search` as a queryset-analogue + '''abstract view class using `elasticsearch8.dsl.Search` as a queryset-analogue builds a `Search` based on `self.get_default_search()` and the request's query parameters for filtering, sorting, and pagination -- fetches only @@ -35,18 +36,18 @@ class ElasticsearchListView(FilterMixin, JSONAPIBaseView, generics.ListAPIView, ordering_fields: frozenset[str] = frozenset() # serializer field names @abc.abstractmethod - def get_default_search(self) -> edsl.Search | None: - '''the base `elasticsearch6_dsl.Search` for this list, based on url path + def get_default_search(self) -> esdsl.Search | None: + '''the base `elasticsearch8.dsl.Search` for this list, based on url path (common jsonapi query parameters will be considered automatically) ''' ... - FILE_RENDERER_CLASSES = { + FILE_RENDERER_CLASSES = ( MetricsReportsCsvRenderer, MetricsReportsTsvRenderer, MetricsReportsJsonRenderer, - } + ) def set_content_disposition(self, response, renderer: str): """Set the Content-Disposition header to prompt a file download with the appropriate filename. 
@@ -75,7 +76,7 @@ def finalize_response(self, request, response, *args, **kwargs): response = super().finalize_response(request, response, *args, **kwargs) # Check if this is a direct download request or file renderer classes, set to the Content-Disposition header # so filename and attachment for browser download - if isinstance(request.accepted_renderer, tuple(self.FILE_RENDERER_CLASSES)): + if isinstance(request.accepted_renderer, self.FILE_RENDERER_CLASSES): self.set_content_disposition(response, request.accepted_renderer) return response @@ -95,7 +96,7 @@ def finalize_response(self, request, response, *args, **kwargs): # (filtering handled in-view to reuse logic from FilterMixin) filter_backends = () - # note: because elasticsearch6_dsl.Search supports slicing and gives results when iterated on, + # note: because elasticsearch8.dsl.Search supports slicing and gives results when iterated on, # it works fine with default pagination # override rest_framework.generics.GenericAPIView @@ -128,10 +129,17 @@ def get_queryset(self): ) return self.__add_sort(_search) + def get_serializer_context(self): + return ( + super().get_serializer_context() + if issubclass(self.get_serializer_class(), Serializer) + else {} # allow custom BaseSerializer-based serializer + ) + ### # private methods - def __add_sort(self, search: edsl.Search) -> edsl.Search: + def __add_sort(self, search: esdsl.Search) -> esdsl.Search: _elastic_sort = self.__get_elastic_sort() return (search if _elastic_sort is None else search.sort(_elastic_sort)) @@ -148,17 +156,20 @@ def __get_elastic_sort(self) -> str | None: raise drf_exceptions.ValidationError( f'invalid value for {drf_settings.ORDERING_PARAM} query param (valid values: {", ".join(self.ordering_fields)})', ) - _serializer_field = self.get_serializer().fields[_sort_field] - _elastic_sort_field = _serializer_field.source + _elastic_sort_field = ( + self.get_serializer().fields[_sort_field].source + if issubclass(self.get_serializer_class(), 
Serializer) + else _sort_field # allow custom BaseSerializer-based serializer + ) return (_elastic_sort_field if _ascending else f'-{_elastic_sort_field}') def __add_search_filter( self, - search: edsl.Search, + search: esdsl.Search, elastic_field_name: str, operator: str, value: str, - ) -> edsl.Search: + ) -> esdsl.Search: match operator: # operators from FilterMixin case 'eq': if value == '': diff --git a/api/base/metrics.py b/api/base/metrics.py index d68f19a45b8..25208784131 100644 --- a/api/base/metrics.py +++ b/api/base/metrics.py @@ -1,15 +1,16 @@ -import re -from datetime import timedelta - +import abc import waffle -from django.utils import timezone from api.base.exceptions import InvalidQueryStringError from osf import features -from website.settings import PREPRINT_METRICS_START_DATE +from osf.metrics.es8_metrics import ( + OsfCountedUsageEvent, + MonthlyPublicItemUsageReportEs8, +) +from osf.models.base import osfid_iri -class MetricsViewMixin: +class UsageMetricsViewMixin(abc.ABC): """Mixin for views that expose metrics via django-elasticsearch-metrics. Enables metrics to be requested with a query parameter, like so: :: @@ -18,110 +19,98 @@ class MetricsViewMixin: Any subclass of this mixin MUST do the following: * Use a serializer_class that subclasses MetricsSerializerMixin - * Define metric_map as a class variable. 
It should be dict mapping metric name - ("downloads") to a Metric class (PreprintDownload) - * For list views: implement `get_annotated_queryset_with_metrics` - * For detail views: implement `add_metric_to_object` + * Call add_metrics_to_object(obj) to get `views` and/or `downloads` + assigned on the obj (according to query params) """ - # Adapted from FilterMixin.QUERY_PATTERN - METRICS_QUERY_PATTERN = re.compile(r'^metrics\[(?P((?:,*\s*\w+)*))\]$') - TIMEDELTA_MAP = { - 'daily': timedelta(hours=24), - 'weekly': timedelta(days=7), - 'monthly': timedelta(days=30), - 'yearly': timedelta(days=365), + METRICS_QUERY_MAP = { + 'metrics[views]': OsfCountedUsageEvent.ActionLabel.VIEW, + 'metrics[downloads]': OsfCountedUsageEvent.ActionLabel.DOWNLOAD, + } + METRICS_ATTR_MAP = { + OsfCountedUsageEvent.ActionLabel.VIEW: 'views', + OsfCountedUsageEvent.ActionLabel.DOWNLOAD: 'downloads', + } + TIMESPAN_MAP = { + 'daily': 'now-1d/d', + 'weekly': 'now-1w/d', + 'monthly': 'now-1M/d', } VALID_METRIC_PERIODS = { 'daily', 'weekly', 'monthly', - 'yearly', 'total', } - @property - def metric_map(self): - raise NotImplementedError('MetricsViewMixin subclasses must define a metric_map class variable.') - - def get_annotated_queryset_with_metrics(self, queryset, metric_class, metric_name, after): - """Return a queryset annotated with metrics. Use for list endpoints that expose metrics.""" - raise NotImplementedError('MetricsViewMixin subclasses must define get_annotated_queryset_with_metrics().') - - def add_metric_to_object(self, obj, metric_class, metric_name, after): - """Set an attribute for a metric on obj. Use for detail endpoints that expose metrics. - Return the modified object. - """ - raise NotImplementedError('MetricsViewMixin subclasses must define add_metric_to_object().') - - @property - def metrics_default_after(self): - """Value to be used as the `after` in metrics queries if not otherwise specified. - Datetime or None. 
- """ - return None - @property def metrics_requested(self): return ( - waffle.switch_is_active(features.ELASTICSEARCH_METRICS) and - bool(self.parse_metric_query_params(self.request.query_params)) + waffle.switch_is_active(features.ELASTICSEARCH_METRICS) + and any(_param in self.METRICS_QUERY_MAP for _param in self.request.query_params) ) - # Adapted from FilterMixin.parse_query_params - # TODO: Should we get rid of query_params argument and use self.request.query_params instead? - def parse_metric_query_params(self, query_params): + def get_item_iri(self, item): + return osfid_iri(item._id) + + def parse_metric_query_params(self): """Parses query parameters to a dict usable for fetching metrics. :param dict query_params: :return dict of the format { - : { - 'period': <[daily|weekly|monthly|yearly|total]>, - } + : <[daily|weekly|monthly|yearly|total]>, } """ query = {} - for key, value in query_params.items(): - match = self.METRICS_QUERY_PATTERN.match(key) - if match: - match_dict = match.groupdict() - metric_name = match_dict['metric_name'] - query[metric_name] = value + for key, value in self.request.query_params.items(): + _usage_label = self.METRICS_QUERY_MAP.get(key) + if _usage_label: + if value not in self.VALID_METRIC_PERIODS: + raise InvalidQueryStringError(f"Invalid period for metric: '{value}'", parameter='metrics') + query[_usage_label] = value return query - def _add_metrics(self, queryset_or_obj, method): - """Parse the ?metric[METRIC]=PERIOD query param, validate it, and - run ``method`` for each requested object. - - This is used to share code between add_metric_to_object and get_metrics_queryset. + def add_metrics_to_object(self, obj): + """Helper method used for detail views. 
""" - metrics_requested = self.parse_metric_query_params(self.request.query_params) - if metrics_requested: - metric_map = self.metric_map - for metric, period in metrics_requested.items(): - if metric not in metric_map: - raise InvalidQueryStringError(f"Invalid metric in query string: '{metric}'", parameter='metrics') - if period not in self.VALID_METRIC_PERIODS: - raise InvalidQueryStringError(f"Invalid period for metric: '{period}'", parameter='metrics') - metric_class = metric_map[metric] - if period == 'total': - after = self.metrics_default_after + for _action_label, _period in self.parse_metric_query_params().items(): + _count = self._get_usage_count(self.get_item_iri(obj), _action_label, _period) + setattr(obj, self.METRICS_ATTR_MAP[_action_label], _count) + + def _get_usage_count(self, item_iri, action_label, period): + _search = ( + OsfCountedUsageEvent.search() + .filter('term', item_iri=item_iri) + .filter('term', action_labels=action_label.value) + ) + _prior_count = 0 + if _timespan := self.TIMESPAN_MAP.get(period): + _search = _search.filter('range', timestamp={'gte': _timespan}) + else: # cumulative total + _latest_usage_report = self._get_latest_usage_report(item_iri) + if _latest_usage_report: + _search = _search.filter( + 'range', timestamp={ + 'gte': _latest_usage_report.report_yearmonth.month_end(), + }, + ) + if action_label == OsfCountedUsageEvent.ActionLabel.VIEW: + _prior_count = _latest_usage_report.cumulative_view_count + elif action_label == OsfCountedUsageEvent.ActionLabel.DOWNLOAD: + _prior_count = _latest_usage_report.cumulative_download_count else: - after = timezone.now() - self.TIMEDELTA_MAP[period] - queryset_or_obj = method(queryset_or_obj, metric_class, metric, after) - return queryset_or_obj - - def add_metrics_to_object(self, obj): - """Helper method used for detail views.""" - return self._add_metrics(obj, method=self.add_metric_to_object) - - def get_metrics_queryset(self, queryset): - """Helper method used for list 
views.""" - return self._add_metrics(queryset, method=self.get_annotated_queryset_with_metrics) + raise ValueError(f'unsupported action label {action_label!r}') + _response = _search[0:0].execute() + return _prior_count + _response.doc_count + + def _get_latest_usage_report(self, item_iri): + _search = ( + MonthlyPublicItemUsageReportEs8.search() + .filter('term', item_iri=item_iri) + .sort('-cycle_coverage') + ) + _response = _search[0].execute() + return _response[0] if _response else None - # Override get_default_queryset for convenience - def get_default_queryset(self): - queryset = super().get_default_queryset() - return self.get_metrics_queryset(queryset) class MetricsSerializerMixin: @property @@ -138,9 +127,3 @@ def get_meta(self, obj): meta = meta or {'metrics': {}} meta['metrics'][metric] = getattr(obj, metric) return meta - - -class PreprintMetricsViewMixin(MetricsViewMixin): - @property - def metrics_default_after(self): - return PREPRINT_METRICS_START_DATE diff --git a/api/institutions/views.py b/api/institutions/views.py index d653f5b4e77..47ac52707ae 100644 --- a/api/institutions/views.py +++ b/api/institutions/views.py @@ -10,8 +10,11 @@ from framework.auth.oauth_scopes import CoreScopes from osf.models import OSFUser, Node, Institution, Registration -from osf.metrics.reports import InstitutionalUserReport, InstitutionMonthlySummaryReport -from osf.metrics.utils import YearMonth +from osf.metrics.es8_metrics import ( + MonthlyInstitutionalUserReportEs8, + MonthlyInstitutionSummaryReportEs8, +) +from osf.metrics.utils import YearMonth, cycle_coverage_yearmonth from osf.utils import permissions as osf_permissions from api.base import permissions as base_permissions @@ -27,11 +30,6 @@ ) from api.base.exceptions import RelationshipPostMakesNoChanges from api.metrics.permissions import IsInstitutionalMetricsUser -from api.metrics.renderers import ( - MetricsReportsCsvRenderer, - MetricsReportsTsvRenderer, - MetricsReportsJsonRenderer, -) from 
api.nodes.serializers import NodeSerializer from api.nodes.filters import NodesFilterMixin from api.users.serializers import UserSerializer @@ -411,23 +409,21 @@ class InstitutionDepartmentList(InstitutionMixin, ElasticsearchListView): serializer_class = InstitutionDepartmentMetricsSerializer renderer_classes = ( *api_settings.DEFAULT_RENDERER_CLASSES, - MetricsReportsCsvRenderer, - MetricsReportsTsvRenderer, - MetricsReportsJsonRenderer, + *ElasticsearchListView.FILE_RENDERER_CLASSES, ) pagination_class = JSONAPINoPagination def get_default_search(self): _base_search = ( - InstitutionalUserReport.search() + MonthlyInstitutionalUserReportEs8.search() .filter('term', institution_id=self.get_institution()._id) ) - _yearmonth = InstitutionalUserReport.most_recent_yearmonth(base_search=_base_search) - if _yearmonth is None: + _most_recent_cycle = MonthlyInstitutionalUserReportEs8.most_recent_cycle(_base_search) + if _most_recent_cycle is None: return None _search = ( _base_search - .filter('term', report_yearmonth=str(_yearmonth)) + .filter('term', cycle_coverage=_most_recent_cycle) .exclude('term', user_name='Deleted user') ) # add aggregation on department name @@ -468,9 +464,7 @@ class InstitutionUserMetricsList(InstitutionMixin, ElasticsearchListView): view_name = 'institution-user-metrics' renderer_classes = ( *api_settings.DEFAULT_RENDERER_CLASSES, - MetricsReportsCsvRenderer, - MetricsReportsTsvRenderer, - MetricsReportsJsonRenderer, + *ElasticsearchListView.FILE_RENDERER_CLASSES, ) serializer_class = InstitutionUserMetricsSerializer @@ -492,17 +486,16 @@ class InstitutionUserMetricsList(InstitutionMixin, ElasticsearchListView): )) def get_default_search(self): - base_search = InstitutionalUserReport.search().filter( - 'term', - institution_id=self.get_institution()._id, + _base_search = ( + MonthlyInstitutionalUserReportEs8.search() + .filter('term', institution_id=self.get_institution()._id) ) - yearmonth = 
InstitutionalUserReport.most_recent_yearmonth(base_search=base_search) - if yearmonth is None: + _most_recent_cycle = MonthlyInstitutionalUserReportEs8.most_recent_cycle(_base_search) + if _most_recent_cycle is None: return None - return ( - base_search - .filter('term', report_yearmonth=str(yearmonth)) + _base_search + .filter('term', cycle_coverage=_most_recent_cycle) .exclude('term', user_name='Deleted user') ) @@ -525,29 +518,33 @@ class InstitutionSummaryMetricsDetail(JSONAPIBaseView, generics.RetrieveAPIView, serializer_class = InstitutionSummaryMetricsSerializer def get_object(self): - institution = self.get_institution() - search_object = self.get_default_search() - if search_object: - object = search_object.execute()[0] - object.id = institution._id - return object + _institution = self.get_institution() + _search = self.get_default_search() + if _search: + _response = _search[0].execute() + if _response: + _report = _response[0] + _report.id = _institution._id + return _report + return None def get_default_search(self): - base_search = InstitutionMonthlySummaryReport.search().filter( - 'term', - institution_id=self.get_institution()._id, + _base_search = ( + MonthlyInstitutionSummaryReportEs8.search() + .filter('term', institution_id=self.get_institution()._id) ) - yearmonth = InstitutionMonthlySummaryReport.most_recent_yearmonth(base_search=base_search) - if report_date_str := self.request.query_params.get('report_yearmonth'): + _cycle_coverage = None + if _yearmonth_str := self.request.query_params.get('report_yearmonth'): try: - yearmonth = YearMonth.from_str(report_date_str) + _yearmonth = YearMonth.from_str(_yearmonth_str) except ValueError: - pass - - if yearmonth is None: + raise exceptions.ValidationError( + 'report_yearmonth query param must be in YYYY-MM format', + ) + else: + _cycle_coverage = cycle_coverage_yearmonth(_yearmonth) + else: + _cycle_coverage = MonthlyInstitutionSummaryReportEs8.most_recent_cycle(_base_search) + if _cycle_coverage 
is None: return None - - return base_search.filter( - 'term', - report_yearmonth=str(yearmonth), - ) + return _base_search.filter('term', cycle_coverage=_cycle_coverage) diff --git a/api/metrics/serializers.py b/api/metrics/serializers.py index 9e3f61f5b50..7c1fb9223f7 100644 --- a/api/metrics/serializers.py +++ b/api/metrics/serializers.py @@ -1,26 +1,12 @@ import logging -import datetime from rest_framework import serializers as ser -from api.base.serializers import BaseAPISerializer from api.base.utils import absolute_reverse -from osf.metrics.counted_usage import CountedAuthUsage, PageviewInfo -from osf.metrics.es8_metrics import ( - OsfCountedUsageEvent, - PageviewInfo as PageviewInfoEs8, -) -from website import settings as website_settings +from osf.metrics.es8_metrics import OsfCountedUsageEvent -logger = logging.getLogger(__name__) - - -class PreprintMetricSerializer(BaseAPISerializer): - - query = ser.DictField() - class Meta: - type_ = 'preprint_metrics' +logger = logging.getLogger(__name__) class RawMetricsSerializer(): @@ -30,9 +16,9 @@ class RawMetricsSerializer(): def validate_action_label(label): try: - CountedAuthUsage.ActionLabel(label) + OsfCountedUsageEvent.ActionLabel(label) except ValueError: - valid_labels = ', '.join(label.value for label in CountedAuthUsage.ActionLabel) + valid_labels = ', '.join(label.value for label in OsfCountedUsageEvent.ActionLabel) raise ser.ValidationError( f'Invalid value in action_labels! 
Valid labels: {valid_labels}', ) @@ -67,31 +53,17 @@ def validate(self, data): return data def create(self, validated_data): - pageview_info = None - pageview_info_es8 = None - if pageview_info_data := validated_data.get('pageview_info'): - pageview_info = PageviewInfo(**pageview_info_data) - pageview_info_es8 = PageviewInfoEs8(**pageview_info_data) - OsfCountedUsageEvent.record( + return OsfCountedUsageEvent.record( item_osfid=validated_data['item_guid'], action_labels=validated_data.get('action_labels'), provider_id=validated_data.get('provider_id'), - pageview_info=pageview_info_es8, + pageview_info=validated_data.get('pageview_info'), # used to create a COUNTER session-hour id, not stored: client_session_id=validated_data.get('client_session_id'), user_id=self.context.get('user_id'), request_host=self.context.get('request_host'), request_useragent=self.context.get('request_useragent'), ) - return CountedAuthUsage.record( - platform_iri=website_settings.DOMAIN, - provider_id=validated_data.get('provider_id'), - item_guid=validated_data.get('item_guid'), - session_id=validated_data['session_id'], # must be provided by the view - user_is_authenticated=validated_data['user_is_authenticated'], # must be provided by the view - action_labels=validated_data.get('action_labels'), - pageview_info=pageview_info, - ) class ReportNameSerializer(ser.BaseSerializer): @@ -109,44 +81,16 @@ def to_representation(self, instance): } -class DailyReportSerializer(ser.BaseSerializer): - def to_representation(self, instance): - # TODO: detangle datamodel (osf.metrics.reports) from api serialization - # (don't use `to_dict` here) - report_as_dict = instance.to_dict() - report_name = self.context['report_name'] - report_date = report_as_dict['report_date'] - - if isinstance(report_date, datetime.datetime): - report_date = report_date.date() - if isinstance(report_date, datetime.date): - report_date = str(report_date) - - return { - 'id': instance.meta.id, - 'type': 
f'daily-report:{report_name}', - 'attributes': { - **report_as_dict, - 'report_date': report_date, - }, - } - - -class MonthlyReportSerializer(ser.BaseSerializer): +class CyclicReportSerializer(ser.BaseSerializer): def to_representation(self, instance): # TODO: detangle datamodel (osf.metrics.reports) from api serialization # (don't use `to_dict` here) report_as_dict = instance.to_dict() report_name = self.context['report_name'] - report_yearmonth = report_as_dict['report_yearmonth'] - return { 'id': instance.meta.id, - 'type': f'monthly-report:{report_name}', - 'attributes': { - **report_as_dict, - 'report_month': report_yearmonth, - }, + 'type': f'cyclic-report:{report_name}', + 'attributes': report_as_dict, } @@ -158,28 +102,28 @@ def to_representation(self, instance): 'path': bucket['key'], 'route': bucket['route-for-path'].buckets[0]['key'], 'title': bucket['title-for-path'].buckets[0]['key'], - 'count': bucket['doc_count'], + 'count': bucket['unique-count'].value, } for bucket in aggs['popular-pages'].buckets ] unique_visits = [ { 'date': bucket['key'].date(), - 'count': bucket['doc_count'], + 'count': bucket['unique-count'].value, } for bucket in aggs['unique-visits'].buckets ] time_of_day = [ { 'hour': bucket['key'], - 'count': bucket['doc_count'], + 'count': bucket['unique-count'].value, } for bucket in aggs['time-of-day'].buckets ] referer_domain = [ { 'referer_domain': bucket['key'], - 'count': bucket['doc_count'], + 'count': bucket['unique-count'].value, } for bucket in aggs['referer-domain'].buckets ] diff --git a/api/metrics/urls.py b/api/metrics/urls.py index db63df3dd4c..83c6362e517 100644 --- a/api/metrics/urls.py +++ b/api/metrics/urls.py @@ -8,8 +8,6 @@ re_path(r'^raw/(?P[a-z0-9._/]*)$', views.RawMetricsView.as_view(), name=views.RawMetricsView.view_name, kwargs={'djelme_backend_name': 'osfmetrics_es6'}), path('raw-/', views.RawMetricsView.as_view(), name=views.RawMetricsView.view_name, kwargs={'url_path': ''}), path('raw-/', 
views.RawMetricsView.as_view(), name=views.RawMetricsView.view_name), - re_path(r'^preprints/views/$', views.PreprintViewMetrics.as_view(), name=views.PreprintViewMetrics.view_name), - re_path(r'^preprints/downloads/$', views.PreprintDownloadMetrics.as_view(), name=views.PreprintDownloadMetrics.view_name), re_path(r'^registries_moderation/transitions/$', views.RegistriesModerationMetricsView.as_view(), name=views.RegistriesModerationMetricsView.view_name), re_path( @@ -17,8 +15,13 @@ views.ReportNameList.as_view(), name=views.ReportNameList.view_name, ), - re_path( - r'^reports/(?P[a-z0-9_]+)/recent/$', + path( + 'reports//', + views.ReportList.as_view(), + name=views.ReportList.view_name, + ), + path( + 'reports//recent/', views.RecentReportList.as_view(), name=views.RecentReportList.view_name, ), diff --git a/api/metrics/views.py b/api/metrics/views.py index bd53bee296e..772de079995 100644 --- a/api/metrics/views.py +++ b/api/metrics/views.py @@ -4,10 +4,7 @@ from enum import Enum from django.http import JsonResponse, HttpResponse, Http404 -from django.utils import timezone -from elasticsearch6.exceptions import NotFoundError, RequestError -from elasticsearch6_dsl.connections import get_connection from elasticsearch_metrics.registry import djelme_registry from framework.auth.oauth_scopes import CoreScopes @@ -18,11 +15,11 @@ from rest_framework.generics import GenericAPIView from rest_framework.settings import api_settings as drf_api_settings +from api.base.elasticsearch_dsl_views import ElasticsearchListView from api.base.views import JSONAPIBaseView from api.base.permissions import TokenHasScope from api.base.waffle_decorators import require_switch from api.metrics.permissions import ( - IsPreprintMetricsUser, IsRawMetricsUser, IsRegistriesModerationMetricsUser, ) @@ -31,10 +28,8 @@ MetricsReportsTsvRenderer, ) from api.metrics.serializers import ( - PreprintMetricSerializer, RawMetricsSerializer, - DailyReportSerializer, - MonthlyReportSerializer, + 
CyclicReportSerializer, ReportNameSerializer, NodeAnalyticsSerializer, UserVisitsSerializer, @@ -42,169 +37,47 @@ CountedAuthUsageSerializer, ) from api.metrics.utils import ( - parse_datetimes, parse_date_range, should_skip_counted_usage, ) from api.nodes.permissions import MustBePublic from osf.features import ENABLE_RAW_METRICS -from osf.metrics import ( - utils, - reports, - PreprintDownload, - PreprintView, - RegistriesModerationMetrics, - CountedAuthUsage, +from osf.metrics.es8_metrics import ( + BaseDailyReport, + BaseMonthlyReport, + OsfCountedUsageEvent, + RegistriesModerationEventEs8, + DailyDownloadCountReportEs8, + DailyInstitutionSummaryReportEs8, + DailyNodeSummaryReportEs8, + DailyOsfstorageFileCountReportEs8, + DailyPreprintSummaryReportEs8, + DailyStorageAddonUsageReportEs8, + DailyUserSummaryReportEs8, + DailyNewUserDomainReportEs8, + MonthlySpamSummaryReportEs8, ) from osf.metrics.openapi import get_metrics_openapi_json_dict from osf.models import AbstractNode +from osf.utils.workflows import RegistrationModerationTriggers, RegistrationModerationStates logger = logging.getLogger(__name__) -class PreprintMetricMixin(JSONAPIBaseView): - permission_classes = ( - drf_permissions.IsAuthenticated, - drf_permissions.IsAdminUser, - IsPreprintMetricsUser, - TokenHasScope, - ) - - required_read_scopes = [CoreScopes.METRICS_BASIC] - required_write_scopes = [CoreScopes.METRICS_RESTRICTED] - - serializer_class = PreprintMetricSerializer - - @property - def metric_type(self): - raise NotImplementedError - - @property - def metric(self): - raise NotImplementedError - - def add_search(self, search, query_params, **kwargs): - """ - get list of guids from the kwargs - use that in a query to narrow down metrics results - """ - preprint_guid_string = query_params.get('guids') - if not preprint_guid_string: - raise ValidationError( - 'To gather metrics for preprints, you must provide one or more preprint ' + - 'guids in the `guids` query parameter.', - ) - 
preprint_guids = preprint_guid_string.split(',') - - return search.filter('terms', preprint_id=preprint_guids) - - def format_response(self, response, query_params): - data = [] - if getattr(response, 'aggregations') and response.aggregations: - for result in response.aggregations.dates.buckets: - guid_results = {} - for preprint_result in result.preprints.buckets: - guid_results[preprint_result['key']] = preprint_result['total']['value'] - # return 0 for the guids with no results for consistent payloads - guids = query_params['guids'].split(',') - if guid_results.keys() != guids: - for guid in guids: - if not guid_results.get(guid): - guid_results[guid] = 0 - result_dict = {result.key_as_string: guid_results} - data.append(result_dict) - - return { - 'metric_type': self.metric_type, - 'data': data, - } - - def execute_search(self, search, query=None): - try: - # There's a bug in the ES python library the prevents us from updating the search object, so lets just make - # the raw query. If we have it. - if query: - es = get_connection(search._using) - response = search._response_class( - search, - es.search( - index=search._index, - body=query, - ), - ) - else: - response = search.execute() - except NotFoundError: - # _get_relevant_indices returned 1 or more indices - # that doesn't exist. 
Fall back to unoptimized query - search = search.index().index(self.metric._default_index()) - response = search.execute() - return response - - def get(self, *args, **kwargs): - query_params = getattr(self.request, 'query_params', self.request.GET) - - interval = query_params.get('interval', 'day') - - start_datetime, end_datetime = parse_datetimes(query_params) - - search = self.metric.search(after=start_datetime) - search = search.filter('range', timestamp={'gte': start_datetime, 'lt': end_datetime}) - search.aggs.bucket('dates', 'date_histogram', field='timestamp', interval=interval) \ - .bucket('preprints', 'terms', field='preprint_id') \ - .metric('total', 'sum', field='count') - search = self.add_search(search, query_params, **kwargs) - response = self.execute_search(search) - resp_dict = self.format_response(response, query_params) - - return JsonResponse(resp_dict) - - def post(self, request, *args, **kwargs): - """ - For a bit of future proofing, accept custom elasticsearch aggregation queries in JSON form. - Caution - this could be slow if a very large query is executed, so use with care! 
- """ - search = self.metric.search() - query = request.data.get('query') - - try: - results = self.execute_search(search, query) - except RequestError as e: - if e.args: - raise ValidationError(e.info['error']['root_cause'][0]['reason']) - raise ValidationError('Malformed elasticsearch query.') - - return JsonResponse(results.to_dict()) - - -class PreprintViewMetrics(PreprintMetricMixin): - - view_category = 'preprint-metrics' - view_name = 'preprint-view-metrics' - - @property - def metric_type(self): - return 'views' - - @property - def metric(self): - return PreprintView - - -class PreprintDownloadMetrics(PreprintMetricMixin): - - view_category = 'preprint-metrics' - view_name = 'preprint-download-metrics' - - @property - def metric_type(self): - return 'downloads' +VIEWABLE_REPORTS = { + 'download_count': DailyDownloadCountReportEs8, + 'institution_summary': DailyInstitutionSummaryReportEs8, + 'node_summary': DailyNodeSummaryReportEs8, + 'osfstorage_file_count': DailyOsfstorageFileCountReportEs8, + 'preprint_summary': DailyPreprintSummaryReportEs8, + 'storage_addon_usage': DailyStorageAddonUsageReportEs8, + 'user_summary': DailyUserSummaryReportEs8, + 'spam_summary': MonthlySpamSummaryReportEs8, + 'new_user_domains': DailyNewUserDomainReportEs8, +} - @property - def metric(self): - return PreprintDownload class RawMetricsView(GenericAPIView): @@ -287,21 +160,85 @@ class RegistriesModerationMetricsView(GenericAPIView): view_name = 'raw-metrics-view' def get(self, request, *args, **kwargs): - return JsonResponse(RegistriesModerationMetrics.get_registries_info()) + _search = RegistriesModerationEventEs8.search().update_from_dict(self._build_es_query()) + _search_response = _search.execute() + _providers_agg_json = ( + _search_response.aggregations['providers'].to_dict() + if _search_response.aggregations + else {} + ) + return JsonResponse(_providers_agg_json) + + def _build_es_query(self): + _submit_trigger = RegistrationModerationTriggers.SUBMIT.db_name + 
_reject_trigger = RegistrationModerationTriggers.REJECT_SUBMISSION.db_name + _accept_withdrawal_trigger = RegistrationModerationTriggers.ACCEPT_WITHDRAWAL.db_name + _accepted_state = RegistrationModerationStates.ACCEPTED.db_name + _embargo_state = RegistrationModerationStates.EMBARGO.db_name + _rejected_state = RegistrationModerationStates.REJECTED.db_name + _withdrawn_state = RegistrationModerationStates.WITHDRAWN.db_name + return { + 'aggs': { + 'providers': { + 'terms': {'field': 'provider_id'}, + 'aggs': { + 'transitions_without_comments': { + 'missing': {'field': 'comment'}, + }, + 'transitions_with_comments': { + 'filter': {'exists': {'field': 'comment'}}, + }, + 'submissions': { + 'filter': {'term': {'trigger': _submit_trigger}}, + }, + 'accepted_with_embargo': { + 'filter': { + 'bool': { + 'must': [ + {'term': {'to_state': _embargo_state}}, + {'term': {'trigger': _submit_trigger}}, + ], + }, + }, + }, + 'accepted_without_embargo': { + 'filter': { + 'bool': { + 'must': [ + {'term': {'to_state': _accepted_state}}, + {'term': {'trigger': _submit_trigger}}, + ], + }, + }, + }, + 'rejected': { + 'filter': { + 'bool': { + 'must': [ + {'term': {'to_state': _rejected_state}}, + {'term': {'trigger': _reject_trigger}}, + ], + }, + }, + }, + 'withdrawn': { + 'filter': { + 'bool': { + 'must': [ + {'term': {'to_state': _withdrawn_state}}, + {'term': {'trigger': _accept_withdrawal_trigger}}, + ], + }, + }, + }, + }, + }, + }, + } -VIEWABLE_REPORTS = { - 'download_count': reports.DownloadCountReport, - 'institution_summary': reports.InstitutionSummaryReport, - 'node_summary': reports.NodeSummaryReport, - 'osfstorage_file_count': reports.OsfstorageFileCountReport, - 'preprint_summary': reports.PreprintSummaryReport, - 'storage_addon_usage': reports.StorageAddonUsage, - 'user_summary': reports.UserSummaryReport, - 'spam_summary': reports.SpamSummaryReport, - 'new_user_domains': reports.NewUserDomainReport, -} - +### +# reports class ReportNameList(JSONAPIBaseView): 
permission_classes = ( @@ -325,6 +262,46 @@ def get(self, request, *args, **kwargs): return Response({'data': serializer.data}) +class ReportList(ElasticsearchListView): + view_category = 'metrics' + view_name = 'report-list' + + permission_classes = ( + TokenHasScope, + drf_permissions.IsAuthenticatedOrReadOnly, + ) + + required_read_scopes = [CoreScopes.ALWAYS_PUBLIC] + required_write_scopes = [CoreScopes.NULL] + + serializer_class = CyclicReportSerializer + renderer_classes = ( + *drf_api_settings.DEFAULT_RENDERER_CLASSES, + *ElasticsearchListView.FILE_RENDERER_CLASSES, + ) + + default_ordering = '-cycle_coverage' + ordering_fields = frozenset(( + 'cycle_coverage', + )) + + def get_default_search(self): + """the search for the report type named in the url path (404 if the name is unknown)""" + _report_name = self.kwargs['report_name'] + try: + _report_cls = VIEWABLE_REPORTS[_report_name] + except KeyError: + # raise rather than return a `Response`: the caller expects a `Search` (or None) + # from this method and would treat a returned `Response` as the search itself + raise Http404(f'unknown report: "{_report_name}"') + return _report_cls.search() + + def get_serializer_context(self): + return { + **super().get_serializer_context(), + 'report_name': self.kwargs['report_name'], + } + class RecentReportList(JSONAPIBaseView): MAX_COUNT = 10000 DEFAULT_DAYS_BACK = 13 @@ -340,7 +322,7 @@ class RecentReportList(JSONAPIBaseView): view_category = 'metrics' view_name = 'recent-report-list' - serializer_class = DailyReportSerializer + serializer_class = CyclicReportSerializer renderer_classes = ( *drf_api_settings.DEFAULT_RENDERER_CLASSES, MetricsReportsCsvRenderer, @@ -360,23 +342,15 @@ def get(self, request, *args, report_name): }, status=404, ) - is_daily = issubclass(report_class, reports.DailyReport) + is_daily = issubclass(report_class, BaseDailyReport) days_back = request.GET.get('days_back', self.DEFAULT_DAYS_BACK if is_daily else None) - is_monthly = issubclass(report_class, reports.MonthlyReport) - - if is_daily: - serializer_class = DailyReportSerializer - range_field_name = 'report_date' - elif is_monthly: - 
serializer_class = MonthlyReportSerializer - range_field_name = 'report_yearmonth' - else: - raise ValueError(f'report class must subclass DailyReport or MonthlyReport: {report_class}') + is_monthly = issubclass(report_class, BaseMonthlyReport) + range_filter = parse_date_range(request.GET, is_monthly=is_monthly) search_recent = ( report_class.search() - .filter('range', **{range_field_name: range_filter}) - .sort(range_field_name) + .filter('range', cycle_coverage=range_filter) + .sort('-cycle_coverage') [:self.MAX_COUNT] ) if days_back: @@ -384,7 +358,7 @@ def get(self, request, *args, report_name): report_date_range = parse_date_range(request.GET) search_response = search_recent.execute() - serializer = serializer_class( + serializer = self.serializer_class( search_response, many=True, context={'report_name': report_name}, @@ -428,46 +402,9 @@ def post(self, request, *args, **kwargs): pageview_info=serializer.validated_data.get('pageview_info'), ): return HttpResponse(status=204) - session_id, user_is_authenticated = self._get_session_id( - request, - client_session_id=serializer.validated_data.get('client_session_id'), - ) - serializer.save(session_id=session_id, user_is_authenticated=user_is_authenticated) + serializer.save() return HttpResponse(status=201) - def _get_session_id(self, request, client_session_id=None): - # NOTE: to remove after osfmetrics 6to8 migration -- logic moved to djelme - - # get a session id as described in the COUNTER code of practice: - # https://cop5.projectcounter.org/en/5.0.2/07-processing/03-counting-unique-items.html - # -- different from the "login session" tracked by `osf.models.Session` (which - # lasts about a month), this session lasts at most a day and may time out after - # minutes or hours of inactivity - now = timezone.now() - current_date_str = now.date().isoformat() - - user_is_authenticated = request.user.is_authenticated - if client_session_id: - session_id_parts = [ - client_session_id, - current_date_str, - ] - 
elif user_is_authenticated: - session_id_parts = [ - request.user._id, - current_date_str, - now.hour, - ] - else: - session_id_parts = [ - request.get_host(), - request.META.get('HTTP_USER_AGENT', ''), - current_date_str, - now.hour, - ] - user_is_authenticated = False - return utils.stable_key(*session_id_parts), user_is_authenticated - class NodeAnalyticsQuery(JSONAPIBaseView): permission_classes = ( @@ -495,7 +432,7 @@ def get(self, request, *args, node_guid, timespan): except AbstractNode.DoesNotExist: raise Http404 self.check_object_permissions(request, node) - analytics_result = self._run_query(node_guid, timespan) + analytics_result = self._run_node_analytics_query(node.get_semantic_iri(), timespan) serializer = self.serializer_class( analytics_result, context={ @@ -505,22 +442,18 @@ def get(self, request, *args, node_guid, timespan): ) return Response({'data': serializer.data}) - def _run_query(self, node_guid, timespan): - query_dict = self._build_query_payload(node_guid, NodeAnalyticsQuery.Timespan(timespan)) - analytics_search = CountedAuthUsage.search().update_from_dict(query_dict) + def _run_node_analytics_query(self, item_iri, timespan): + query_dict = self._build_query_payload(item_iri, NodeAnalyticsQuery.Timespan(timespan)) + analytics_search = OsfCountedUsageEvent.search().update_from_dict(query_dict) return analytics_search.execute() - def _build_query_payload(self, node_guid, timespan): + def _build_query_payload(self, item_iri, timespan): return { 'size': 0, # don't return hits, just the aggregations 'query': { 'bool': { - 'minimum_should_match': 1, - 'should': [ - {'term': {'item_guid': node_guid}}, - {'term': {'surrounding_guids': node_guid}}, - ], 'filter': [ + {'term': {'within_iris': item_iri}}, {'term': {'item_public': True}}, {'term': {'action_labels': 'view'}}, {'term': {'action_labels': 'web'}}, @@ -532,7 +465,12 @@ def _build_query_payload(self, node_guid, timespan): 'unique-visits': { 'date_histogram': { 'field': 'timestamp', - 
'interval': 'day', + 'calendar_interval': 'day', + }, + 'aggs': { + 'unique-count': { + 'cardinality': {'field': 'sessionhour_id'}, + }, }, }, 'time-of-day': { @@ -540,12 +478,22 @@ def _build_query_payload(self, node_guid, timespan): 'field': 'pageview_info.hour_of_day', 'size': 24, }, + 'aggs': { + 'unique-count': { + 'cardinality': {'field': 'sessionhour_id'}, + }, + }, }, 'referer-domain': { 'terms': { 'field': 'pageview_info.referer_domain', 'size': 10, }, + 'aggs': { + 'unique-count': { + 'cardinality': {'field': 'sessionhour_id'}, + }, + }, }, 'popular-pages': { 'terms': { @@ -553,6 +501,9 @@ def _build_query_payload(self, node_guid, timespan): 'size': 10, }, 'aggs': { + 'unique-count': { + 'cardinality': {'field': 'sessionhour_id'}, + }, 'route-for-path': { 'terms': { 'field': 'pageview_info.route_name', @@ -627,7 +578,7 @@ def get(self, request, *args): pass # just fall back to days_back for now timespan = report_date - analytics_result = self._run_query(timespan) + analytics_result = self._run_user_visits_query(timespan) serializer = self.serializer_class( analytics_result, context={ @@ -636,9 +587,9 @@ def get(self, request, *args): ) return JsonResponse({'data': serializer.data}) - def _run_query(self, timespan): + def _run_user_visits_query(self, timespan): query_dict = self._build_query_payload(timespan) - analytics_search = CountedAuthUsage.search().update_from_dict(query_dict) + analytics_search = OsfCountedUsageEvent.search().update_from_dict(query_dict) return analytics_search.execute() def _build_query_payload(self, timespan): @@ -655,13 +606,11 @@ def _build_query_payload(self, timespan): 'unique-visits': { 'date_histogram': { 'field': 'timestamp', - 'interval': 'day', + 'calendar_interval': 'day', }, 'aggs': { 'user-visits': { - 'cardinality': { - 'field': 'session_id', - }, + 'cardinality': {'field': 'sessionhour_id'}, }, }, }, diff --git a/api/preprints/views.py b/api/preprints/views.py index 7e087aaa858..3d02b8f704a 100644 --- 
a/api/preprints/views.py +++ b/api/preprints/views.py @@ -71,8 +71,7 @@ from api.requests.serializers import PreprintRequestSerializer, PreprintRequestCreateSerializer from api.requests.views import PreprintRequestMixin from api.subjects.views import BaseResourceSubjectsList, SubjectRelationshipBaseView -from api.base.metrics import PreprintMetricsViewMixin -from osf.metrics import PreprintDownload, PreprintView +from api.base.metrics import UsageMetricsViewMixin class PreprintOldVersionsImmutableMixin: @@ -172,7 +171,7 @@ def get_preprint(self, check_object_permissions=True, ignore_404=False): return preprint -class PreprintList(PreprintMetricsViewMixin, JSONAPIBaseView, generics.ListCreateAPIView, PreprintFilterMixin): +class PreprintList(JSONAPIBaseView, generics.ListCreateAPIView, PreprintFilterMixin): """See [documentation for this endpoint](https://developer.osf.io/#operation/preprints_list). """ # These permissions are not checked for the list of preprints, permissions handled by the query @@ -194,10 +193,6 @@ class PreprintList(PreprintMetricsViewMixin, JSONAPIBaseView, generics.ListCreat ordering_fields = ('created', 'date_last_transitioned') view_category = 'preprints' view_name = 'preprint-list' - metric_map = { - 'downloads': PreprintDownload, - 'views': PreprintView, - } def get_serializer_class(self): if self.request.method == 'POST': @@ -208,38 +203,15 @@ def get_serializer_class(self): def get_default_queryset(self): auth = get_user_auth(self.request) auth_user = getattr(auth, 'user', None) - # Permissions on the list objects are handled by the query - public_only = self.metrics_requested - queryset = self.preprints_queryset(Preprint.objects.all(), auth_user, public_only=public_only) - # Use get_metrics_queryset to return a queryset with annotated metrics - # iff ?metrics query param is present - if self.metrics_requested: - return self.get_metrics_queryset(queryset) - else: - return queryset + return self.preprints_queryset(Preprint.objects.all(), 
auth_user) # overrides ListAPIView def get_queryset(self): return self.get_queryset_from_request() - # overrides PreprintMetricsViewMixin - def get_annotated_queryset_with_metrics(self, queryset, metric_class, metric_name, after): - return metric_class.get_top_by_count( - qs=queryset, - model_field='guids___id', - metric_field='preprint_id', - annotation=metric_name, - after=after, - # Limit the bucket size - # of the ES aggregation. Otherwise, - # the number of buckets == the number of total preprints, - # which is too many for ES to handle - size=200, - ) - -class PreprintVersionsList(PreprintMetricsViewMixin, JSONAPIBaseView, generics.ListCreateAPIView, PreprintFilterMixin): +class PreprintVersionsList(JSONAPIBaseView, generics.ListCreateAPIView, PreprintFilterMixin): """List existing versions of a preprint or create a new version. GET: Returns a collection of preprint resources representing all versions of the given preprint. @@ -265,10 +237,6 @@ class PreprintVersionsList(PreprintMetricsViewMixin, JSONAPIBaseView, generics.L ordering_fields = ('created', 'date_last_transitioned') view_category = 'preprints' view_name = 'preprint-versions' - metric_map = { - 'downloads': PreprintDownload, - 'views': PreprintView, - } def get_serializer_class(self): if self.request.method == 'POST': @@ -288,8 +256,7 @@ def get_queryset(self): auth_user = getattr(auth, 'user', None) # Permissions on the list objects are handled by the query - public_only = self.metrics_requested - qs = qs.filter(Preprint.objects.preprint_versions_permissions_query(auth_user, public_only=public_only)) + qs = qs.filter(Preprint.objects.preprint_versions_permissions_query(auth_user)) return qs @@ -299,7 +266,7 @@ def create(self, request, *args, **kwargs): return super().create(request, *args, **kwargs) -class PreprintDetail(PreprintOldVersionsImmutableMixin, PreprintMetricsViewMixin, JSONAPIBaseView, generics.RetrieveUpdateDestroyAPIView, PreprintMixin, WaterButlerMixin): +class 
PreprintDetail(PreprintOldVersionsImmutableMixin, UsageMetricsViewMixin, JSONAPIBaseView, generics.RetrieveUpdateDestroyAPIView, PreprintMixin, WaterButlerMixin): """See [documentation for this endpoint](https://developer.osf.io/#operation/preprints_read). Note: The resource now exposes a `versions` relationship pointing to @@ -324,15 +291,6 @@ class PreprintDetail(PreprintOldVersionsImmutableMixin, PreprintMetricsViewMixin view_category = 'preprints' view_name = 'preprint-detail' - metric_map = { - 'downloads': PreprintDownload, - 'views': PreprintView, - } - - def add_metric_to_object(self, obj, metric_class, metric_name, after): - count = metric_class.get_count_for_preprint(obj, after=after) - setattr(obj, metric_name, count) - return obj def get_object(self): preprint = self.get_preprint() @@ -355,6 +313,7 @@ def delete(self, request, *args, **kwargs): raise ValidationError('You cannot delete created preprint') + class PreprintNodeRelationship(PreprintOldVersionsImmutableMixin, JSONAPIBaseView, generics.RetrieveUpdateAPIView, PreprintMixin): permission_classes = ( drf_permissions.IsAuthenticatedOrReadOnly, diff --git a/api/providers/views.py b/api/providers/views.py index fbfa287d4a7..4a35706bb4d 100644 --- a/api/providers/views.py +++ b/api/providers/views.py @@ -16,7 +16,6 @@ InvalidFilterValue, ) from api.base.filters import ListFilterMixin, PreprintAsTargetFilterMixin, PreprintFilterMixin -from api.base.metrics import PreprintMetricsViewMixin from api.base.pagination import MaxSizePagination, IncreasedPageSizePagination from api.base.settings import BULK_SETTINGS from api.base.utils import get_object_or_error, get_user_auth, is_truthy @@ -61,7 +60,6 @@ from framework.auth.oauth_scopes import CoreScopes from framework.celery_tasks.handlers import enqueue_task from guardian.shortcuts import get_objects_for_user -from osf.metrics import PreprintDownload, PreprintView from osf.models import ( AbstractNode, CollectionProvider, @@ -148,7 +146,7 @@ class 
RegistrationProviderList(GenericProviderList): view_name = 'registration-providers-list' -class PreprintProviderList(PreprintMetricsViewMixin, GenericProviderList): +class PreprintProviderList(GenericProviderList): """See [documentation for this endpoint](https://developer.osf.io/#operation/preprint_provider_list). """ @@ -156,21 +154,6 @@ class PreprintProviderList(PreprintMetricsViewMixin, GenericProviderList): serializer_class = PreprintProviderSerializer view_category = 'preprint-providers' view_name = 'preprint-providers-list' - metric_map = { - 'downloads': PreprintDownload, - 'views': PreprintView, - } - - # overrides PreprintMetricsViewMixin - def get_annotated_queryset_with_metrics(self, queryset, metric_class, metric_name, after): - return metric_class.get_top_by_count( - qs=queryset, - model_field='_id', - metric_field='provider_id', - annotation=metric_name, - after=after, - size=None, - ) def get_renderer_context(self): context = super().get_renderer_context() diff --git a/api_tests/institutions/views/test_institution_department_list.py b/api_tests/institutions/views/test_institution_department_list.py index 8b785504756..2f359e17bc9 100644 --- a/api_tests/institutions/views/test_institution_department_list.py +++ b/api_tests/institutions/views/test_institution_department_list.py @@ -1,19 +1,25 @@ -import pytest import datetime +import pytest +from elasticsearch_metrics.tests.util import djelme_test_backends + from api.base.settings.defaults import API_BASE, DEFAULT_ES_NULL_VALUE from osf_tests.factories import ( InstitutionFactory, AuthUserFactory, ) -from osf.metrics.reports import InstitutionalUserReport +from osf.metrics.es8_metrics import MonthlyInstitutionalUserReportEs8 from osf.metrics.utils import YearMonth -@pytest.mark.es_metrics @pytest.mark.django_db class TestInstitutionDepartmentList: + @pytest.fixture(autouse=True) + def _real_elastic(self): + with djelme_test_backends(): + yield + @pytest.fixture() def institution(self): return 
InstitutionFactory() @@ -37,55 +43,55 @@ def user4(self): @pytest.fixture() def populate_counts(self, user, user2, user3, user4, admin, institution): # This represents a Department that had a user, but no longer has any users, so does not appear in results. - InstitutionalUserReport( + MonthlyInstitutionalUserReportEs8( report_yearmonth=YearMonth(2017, 2), user_id=user._id, institution_id=institution._id, department_name='Old Department', public_project_count=1, private_project_count=1, - ).save() + ).save(validate=False) _this_month = YearMonth.from_date(datetime.date.today()) # The user has left the department - InstitutionalUserReport( + MonthlyInstitutionalUserReportEs8( report_yearmonth=_this_month, user_id=user._id, institution_id=institution._id, department_name='New Department', public_project_count=1, private_project_count=1, - ).save() + ).save(validate=False) # A second user entered the department - InstitutionalUserReport( + MonthlyInstitutionalUserReportEs8( report_yearmonth=_this_month, user_id=user2._id, institution_id=institution._id, department_name='New Department', public_project_count=1, private_project_count=1, - ).save() + ).save(validate=False) # A new department with a single user to test sorting - InstitutionalUserReport( + MonthlyInstitutionalUserReportEs8( report_yearmonth=_this_month, user_id=user3._id, institution_id=institution._id, department_name='Smaller Department', public_project_count=1, private_project_count=1, - ).save() + ).save(validate=False) # A user with no department - InstitutionalUserReport( + MonthlyInstitutionalUserReportEs8( report_yearmonth=_this_month, user_id=user4._id, institution_id=institution._id, public_project_count=1, private_project_count=1, - ).save() + ).save(validate=False) @pytest.fixture() def admin(self, institution): @@ -113,7 +119,7 @@ def test_auth(self, app, url, user, admin): assert resp.json['data'] == [] def test_get(self, app, url, admin, institution, populate_counts): - 
InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) + MonthlyInstitutionalUserReportEs8.refresh() resp = app.get(url, auth=admin.auth) assert resp.json['data'] == [{ diff --git a/api_tests/institutions/views/test_institution_summary_metrics.py b/api_tests/institutions/views/test_institution_summary_metrics.py index 6dd6c5bbda3..958fe28165f 100644 --- a/api_tests/institutions/views/test_institution_summary_metrics.py +++ b/api_tests/institutions/views/test_institution_summary_metrics.py @@ -1,16 +1,21 @@ import pytest +from elasticsearch_metrics.tests.util import djelme_test_backends from api.base.settings.defaults import API_BASE from osf_tests.factories import ( InstitutionFactory, AuthUserFactory, ) -from osf.metrics.reports import InstitutionMonthlySummaryReport +from osf.metrics.es8_metrics import MonthlyInstitutionSummaryReportEs8 -@pytest.mark.es_metrics @pytest.mark.django_db class TestInstitutionSummaryMetricsList: + @pytest.fixture(autouse=True) + def _real_elastic(self): + with djelme_test_backends(): + yield + @pytest.fixture() def institution(self): return InstitutionFactory() @@ -30,10 +35,10 @@ def unshown_reports(self, institution): # Reports that should not be shown in the results # Report from another institution another_institution = InstitutionFactory() - _summary_report_factory('2024-08', another_institution) + _summary_report_factory('2024-08', another_institution, validate=False) # Old report from the same institution - _summary_report_factory('2024-07', institution) - _summary_report_factory('2018-02', institution) + _summary_report_factory('2024-07', institution, validate=False) + _summary_report_factory('2018-02', institution, validate=False) @pytest.fixture() def reports(self, institution): @@ -84,7 +89,7 @@ def test_get_empty(self, app, url, institutional_admin): assert resp.json['meta'] == {'version': '2.0'} def test_get_report(self, app, url, institutional_admin, institution, reports, 
unshown_reports): - InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern) + MonthlyInstitutionSummaryReportEs8.refresh() resp = app.get(url, auth=institutional_admin.auth) assert resp.status_code == 200 @@ -150,7 +155,7 @@ def test_get_report_with_multiple_months_and_institutions( monthly_logged_in_user_count=270, monthly_active_user_count=260, ) - InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern) + MonthlyInstitutionSummaryReportEs8.refresh() resp = app.get(url, auth=institutional_admin.auth) assert resp.status_code == 200 @@ -179,19 +184,21 @@ def test_get_with_valid_report_dates(self, app, url, institution, institutional_ '2024-08', institution, user_count=0, + validate=False, ) _summary_report_factory( '2024-09', institution, user_count=999, - + validate=False, ) _summary_report_factory( '2018-02', institution, user_count=4133, + validate=False, ) - InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern) + MonthlyInstitutionSummaryReportEs8.refresh() resp = app.get(f'{url}?report_yearmonth=2024-08', auth=institutional_admin.auth) assert resp.status_code == 200 @@ -205,39 +212,25 @@ def test_get_with_valid_report_dates(self, app, url, institution, institutional_ attributes = resp.json['data']['attributes'] assert attributes['user_count'] == 4133 - def test_get_with_invalid_report_date(self, app, url, institution, institutional_admin): - _summary_report_factory( - '2024-08', - institution, - user_count=0, - ) - _summary_report_factory( - '2024-09', - institution, - user_count=999, - ) - InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern) - - # Request with an invalid report_date format - resp = app.get(f'{url}?report_yearmonth=invalid-date', auth=institutional_admin.auth) - assert resp.status_code == 200 - - # 
Verify it defaults to the most recent report data - attributes = resp.json['data']['attributes'] - assert attributes['user_count'] == 999 + def test_get_with_invalid_report_yearmonth(self, app, url, institution, institutional_admin): + # Request with an invalid report_yearmonth format + resp = app.get(f'{url}?report_yearmonth=invalid-date', auth=institutional_admin.auth, expect_errors=True) + assert resp.status_code == 400 def test_get_without_report_date_uses_most_recent(self, app, url, institution, institutional_admin): _summary_report_factory( '2024-08', institution, user_count=0, + validate=False, ) _summary_report_factory( '2024-09', institution, user_count=999, + validate=False, ) - InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern) + MonthlyInstitutionSummaryReportEs8.refresh() resp = app.get(url, auth=institutional_admin.auth) assert resp.status_code == 200 @@ -246,11 +239,11 @@ def test_get_without_report_date_uses_most_recent(self, app, url, institution, i assert attributes['user_count'] == 999 -def _summary_report_factory(yearmonth, institution, **kwargs): - report = InstitutionMonthlySummaryReport( +def _summary_report_factory(yearmonth, institution, *, validate=True, **kwargs): + report = MonthlyInstitutionSummaryReportEs8( report_yearmonth=yearmonth, institution_id=institution._id, **kwargs, ) - report.save() + report.save(validate=validate) return report diff --git a/api_tests/institutions/views/test_institution_user_metric_list.py b/api_tests/institutions/views/test_institution_user_metric_list.py index d2b99da435f..c7a93da7726 100644 --- a/api_tests/institutions/views/test_institution_user_metric_list.py +++ b/api_tests/institutions/views/test_institution_user_metric_list.py @@ -12,7 +12,7 @@ AuthUserFactory, ) -from osf.metrics.reports import InstitutionalUserReport +from osf.metrics.es8_metrics import MonthlyInstitutionalUserReportEs8 from osf.models import UserMessage from 
tests.utils import capture_notifications @@ -89,7 +89,7 @@ def test_get_empty(self, app, url, institutional_admin): assert _resp.json['data'] == [] def test_get_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): - InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) + MonthlyInstitutionalUserReportEs8.refresh() _resp = app.get(url, auth=institutional_admin.auth) assert _resp.status_code == 200 assert len(_resp.json['data']) == len(reports) @@ -101,7 +101,7 @@ def test_get_reports(self, app, url, institutional_admin, institution, reports, assert len(response_object['attributes']['contacts']) == 0 def test_filter_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): - InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) + MonthlyInstitutionalUserReportEs8.refresh() for _query, _expected_user_ids in ( ({'filter[department]': 'nunavum'}, set()), ({'filter[department]': 'incidentally'}, set()), @@ -137,7 +137,7 @@ def test_filter_reports(self, app, url, institutional_admin, institution, report assert set(_user_ids(_resp)) == _expected_user_ids def test_sort_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): - InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) + MonthlyInstitutionalUserReportEs8.refresh() for _query, _expected_user_id_list in ( ({'sort': 'storage_byte_count'}, ['u_sparse', 'u_orc', 'u_blargl', 'u_orcomma']), ({'sort': '-storage_byte_count'}, ['u_orcomma', 'u_blargl', 'u_orc', 'u_sparse']), @@ -147,7 +147,7 @@ def test_sort_reports(self, app, url, institutional_admin, institution, reports, assert list(_user_ids(_resp)) == _expected_user_id_list def test_paginate_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): - 
InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) + MonthlyInstitutionalUserReportEs8.refresh() for _query, _expected_user_id_list in ( ({'sort': 'storage_byte_count', 'page[size]': 2}, ['u_sparse', 'u_orc']), ({'sort': 'storage_byte_count', 'page[size]': 2, 'page': 2}, ['u_blargl', 'u_orcomma']), @@ -182,7 +182,7 @@ def test_get_report_formats_csv_tsv(self, app, url, institutional_admin, institu month_last_active='2018-02', month_last_login='2018-02', ) - InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) + MonthlyInstitutionalUserReportEs8.refresh() resp = app.get(f'{url}?format={format_type}', auth=institutional_admin.auth) assert resp.status_code == 200 @@ -286,7 +286,7 @@ def test_csv_tsv_ignores_pagination(self, app, url, institutional_admin, institu str(736662999298 + i), f'Jalen Hurts #{i}', ]) - InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) + MonthlyInstitutionalUserReportEs8.refresh() # Make request for CSV format with page[size]=10 resp = app.get(f'{url}?format={format_type}', auth=institutional_admin.auth) @@ -352,7 +352,7 @@ def test_get_report_format_table_json(self, app, url, institutional_admin, insti month_last_active='2018-02', month_last_login='2018-02', ) - InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) + MonthlyInstitutionalUserReportEs8.refresh() resp = app.get(f'{url}?format=json_report', auth=institutional_admin.auth) assert resp.status_code == 200 @@ -418,7 +418,7 @@ def test_correct_number_of_contact_messages(self, app, url, institutional_admin, department_name='a department, or so, that happens, incidentally, to have commas', storage_byte_count=736662999298, ) - InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) + MonthlyInstitutionalUserReportEs8.refresh() receiver = user1 with 
capture_notifications(): @@ -480,10 +480,10 @@ def _user_ids(api_response): yield _datum['relationships']['user']['data']['id'] def _report_factory(yearmonth, institution, **kwargs): - _report = InstitutionalUserReport( + _report = MonthlyInstitutionalUserReportEs8( report_yearmonth=yearmonth, institution_id=institution._id, **kwargs, ) - _report.save() + _report.save(validate=False) return _report diff --git a/api_tests/metrics/test_composite_query.py b/api_tests/metrics/test_composite_query.py deleted file mode 100644 index 016677c3a11..00000000000 --- a/api_tests/metrics/test_composite_query.py +++ /dev/null @@ -1,86 +0,0 @@ -import pytest -from datetime import datetime -from osf_tests.factories import ( - PreprintFactory, - AuthUserFactory -) - -from osf.metrics import PreprintDownload -from api.base.settings import API_PRIVATE_BASE as API_BASE - - -@pytest.fixture() -def preprint(): - return PreprintFactory() - - -@pytest.fixture() -def user(): - user = AuthUserFactory() - user.is_staff = True - user.add_system_tag('preprint_metrics') - user.save() - return user - - -@pytest.fixture -def base_url(): - return f'/{API_BASE}metrics/preprints/' - - -@pytest.mark.es_metrics -@pytest.mark.django_db -class TestElasticSearch: - - def test_elasticsearch_agg_query(self, app, user, base_url, preprint): - post_url = f'{base_url}downloads/' - - payload = { - 'data': { - 'type': 'preprint_metrics', - 'attributes': { - 'query': { - 'aggs': { - 'preprints_by_year': { - 'composite': { - 'sources': [{ - 'date': { - 'date_histogram': { - 'field': 'timestamp', - 'interval': 'year' - } - } - }] - } - } - } - } - } - } - } - - resp = app.post_json_api(post_url, payload, auth=user.auth) - - assert resp.status_code == 200 - assert resp.json['hits']['hits'] == [] - - PreprintDownload.record_for_preprint( - preprint, - path=preprint.primary_file.path, - timestamp=datetime(year=2020, month=1, day=1), - ) - PreprintDownload.record_for_preprint( - preprint, - 
path=preprint.primary_file.path, - timestamp=datetime(year=2020, month=2, day=1) - ) - PreprintDownload._get_connection().indices.refresh(PreprintDownload._template_pattern) - - resp = app.post_json_api(post_url, payload, auth=user.auth) - assert resp.status_code == 200 - assert len(resp.json['aggregations']['preprints_by_year']['buckets']) == 1 - - payload['data']['attributes']['query']['aggs']['preprints_by_year']['composite']['sources'][0]['date']['date_histogram']['interval'] = 'month' - - resp = app.post_json_api(post_url, payload, auth=user.auth) - assert len(resp.json['aggregations']['preprints_by_year']['buckets']) == 2 diff --git a/api_tests/metrics/test_counted_usage.py b/api_tests/metrics/test_counted_usage.py index e954248c15b..3860716c6c3 100644 --- a/api_tests/metrics/test_counted_usage.py +++ b/api_tests/metrics/test_counted_usage.py @@ -1,10 +1,14 @@ from datetime import datetime, timezone +from unittest import mock import pytest -from unittest import mock +from elasticsearch_metrics.util.anon_enough import opaque_key from framework.auth.core import Auth +from osf.metadata.rdfutils import OSF +from osf.utils.permissions import ADMIN, READ, WRITE +from api_tests.utils import create_test_file from osf_tests.factories import ( AuthUserFactory, NodeFactory, @@ -12,11 +16,7 @@ PrivateLinkFactory, ProjectFactory, RegistrationFactory, - # UserFactory, ) -from osf.utils.permissions import ADMIN, READ, WRITE -from api_tests.utils import create_test_file -from elasticsearch_metrics.tests.util import djelme_test_backends COUNTED_USAGE_URL = '/_/metrics/events/counted_usage/' @@ -30,23 +30,24 @@ def counted_usage_payload(**attributes): } -def assert_saved_with(mock_save, *, expected_doc_id=None, expected_attrs): - assert mock_save.call_count == 1 - args, kwargs = mock_save.call_args - actual_instance = args[0] +def assert_saved_with(mock_es8, *, expected_doc_id=None, expected_attrs): + assert mock_es8.index.call_count == 1 + _args, _kwargs = 
mock_es8.index.call_args if expected_doc_id is not None: - assert actual_instance.meta.id == expected_doc_id - actual_attrs = actual_instance.to_dict() - for attr_name, expected_value in expected_attrs.items(): - actual_value = actual_attrs.get(attr_name, None) - assert actual_value == expected_value, repr(actual_value) + assert _kwargs['id'] == expected_doc_id + _actual_attrs = _kwargs['body'] + for _attr_name, _expected_value in expected_attrs.items(): + _actual_value = _actual_attrs.get(_attr_name, None) + assert (_actual_value == _expected_value), repr(_actual_value) @pytest.fixture -def mock_save(): - with mock.patch('elasticsearch_metrics.imps.elastic6.BaseMetric.check_index_template'): - with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save: - yield mock_save +def mock_es8(): + with mock.patch('elasticsearch_metrics.imps.elastic8.TimeseriesRecord.check_djelme_setup'): + with mock.patch('elasticsearch_metrics.imps.elastic8.BaseDjelmeRecord._get_connection') as _mock_get_connection: + _mock_es8 = _mock_get_connection.return_value + _mock_es8.index.return_value = {'result': {}} + yield _mock_es8 @pytest.mark.django_db @@ -76,21 +77,19 @@ def test_required_attributes(self, app, attrs): @pytest.mark.django_db class TestComputedFields: - @pytest.fixture(autouse=True) - def _real_elastic(self): - with djelme_test_backends(): - yield - @pytest.fixture(autouse=True) def mock_domain(self): domain = 'http://example.foo/' - with mock.patch('api.metrics.serializers.website_settings.DOMAIN', new=domain): + with mock.patch('website.settings.DOMAIN', new=domain): yield domain @pytest.fixture(autouse=True) def mock_now(self): timestamp = datetime(1981, 1, 1, 0, 1, 31, tzinfo=timezone.utc) - with mock.patch('django.utils.timezone.now', return_value=timestamp): + with ( + mock.patch('django.utils.timezone.now', return_value=timestamp), + mock.patch('elasticsearch_metrics.imps.elastic8.utcnow', return_value=timestamp), + ): yield timestamp 
@pytest.fixture @@ -105,7 +104,7 @@ def user(self): with mock.patch('osf.models.base.generate_guid', return_value='guidy'): return AuthUserFactory() - def test_by_client_session_id(self, app, mock_save, user, preprint): + def test_by_client_session_id(self, app, mock_es8, user, preprint): payload = counted_usage_payload( client_session_id='hello', item_guid=preprint._id, @@ -115,18 +114,25 @@ def test_by_client_session_id(self, app, mock_save, user, preprint): headers = { 'User-Agent': 'haha', } - resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers=headers, auth=user.auth) + resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers=headers) assert resp.status_code == 201 + _expected_sessionhour_id = opaque_key(['hello', '1981-01-01', '0']) assert_saved_with( - mock_save, - # doc_id: sha256(b'http://example.foo/|http://example.foo/blahblah/blee|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3|api,view').hexdigest() - expected_doc_id='3239044c7462dd318edd0522a0ed7d84b9c6502ef16cb40dfcae6c1f456d57a2', + mock_es8, + expected_doc_id=opaque_key([ + 'http://example.foo/', + _expected_sessionhour_id, + "['api', 'view']", + 'http://example.foo/blahblah/blee', + '1981-01-01', + '3', + ]), expected_attrs={ 'platform_iri': 'http://example.foo/', - 'item_guid': preprint._id, - # session_id: sha256(b'hello|1981-01-01').hexdigest() - 'session_id': '5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34', - 'action_labels': ['view', 'api'], + 'item_osfid': preprint._id, + 'item_type': str(OSF.Preprint), + 'sessionhour_id': _expected_sessionhour_id, + 'action_labels': ['api', 'view'], 'pageview_info': { 'page_url': 'http://example.foo/blahblah/blee', 'page_path': '/blahblah/blee', @@ -135,9 +141,9 @@ def test_by_client_session_id(self, app, mock_save, user, preprint): }, ) - def test_by_client_session_id_anon(self, app, mock_save, preprint): + def test_by_client_session_id_anon(self, app, mock_es8, preprint): payload = 
counted_usage_payload( - client_session_id='hello', + client_session_id='hihi', item_guid=preprint._id, action_labels=['view', 'web'], pageview_info={ @@ -150,15 +156,22 @@ def test_by_client_session_id_anon(self, app, mock_save, preprint): } resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers=headers) assert resp.status_code == 201 + _expected_sessionhour_id = opaque_key(['hihi', '1981-01-01', '0']) assert_saved_with( - mock_save, - # doc_id: sha256(b'http://example.foo/|http://example.foo/bliz/|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3|view,web').hexdigest() - expected_doc_id='d01759e963893f9dc9b2ccf016a5ef29135673779802b5578f31449543677e82', + mock_es8, + expected_doc_id=opaque_key([ + 'http://example.foo/', + _expected_sessionhour_id, + "['view', 'web']", + 'http://example.foo/bliz/', + '1981-01-01', + '3', + ]), expected_attrs={ 'platform_iri': 'http://example.foo/', - 'item_guid': preprint._id, - # session_id: sha256(b'hello|1981-01-01').hexdigest() - 'session_id': '5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34', + 'item_osfid': preprint._id, + 'item_type': str(OSF.Preprint), + 'sessionhour_id': _expected_sessionhour_id, 'action_labels': ['view', 'web'], 'pageview_info': { 'page_url': 'http://example.foo/bliz/', @@ -170,7 +183,7 @@ def test_by_client_session_id_anon(self, app, mock_save, preprint): }, ) - def test_by_user_auth(self, app, mock_save, user, preprint): + def test_by_user_auth(self, app, mock_es8, user, preprint): payload = counted_usage_payload( item_guid=preprint._id, action_labels=['view', 'web'], @@ -184,15 +197,22 @@ def test_by_user_auth(self, app, mock_save, user, preprint): } resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers=headers, auth=user.auth) assert resp.status_code == 201 + _expected_sessionhour_id = opaque_key(['guidy', '1981-01-01', '0']) assert_saved_with( - mock_save, - # doc_id: 
sha256(b'http://example.foo/|http://osf.io/mst3k|ec768abb16c3411570af99b9d635c2c32d1ca31d1b25eec8ee73759e7242e74a|1981-01-01|3|view,web').hexdigest() - expected_doc_id='7b8bc27c6d90fb45aa5bbd02deceba9f7384ed61b9a6e7253317c262020b94c2', + mock_es8, + expected_doc_id=opaque_key([ + 'http://example.foo/', + _expected_sessionhour_id, + "['view', 'web']", + 'http://osf.io/mst3k', + '1981-01-01', + '3', + ]), expected_attrs={ 'platform_iri': 'http://example.foo/', - 'item_guid': preprint._id, - # session_id: sha256(b'guidy|1981-01-01|0').hexdigest() - 'session_id': 'ec768abb16c3411570af99b9d635c2c32d1ca31d1b25eec8ee73759e7242e74a', + 'item_osfid': preprint._id, + 'item_type': str(OSF.Preprint), + 'sessionhour_id': _expected_sessionhour_id, 'action_labels': ['view', 'web'], 'pageview_info': { 'page_url': 'http://osf.io/mst3k', @@ -204,7 +224,7 @@ def test_by_user_auth(self, app, mock_save, user, preprint): }, ) - def test_by_useragent_header(self, app, mock_save, preprint): + def test_by_useragent_header(self, app, mock_es8, preprint): payload = counted_usage_payload( item_guid=preprint._id, action_labels=['view', 'api'], @@ -218,16 +238,23 @@ def test_by_useragent_header(self, app, mock_save, preprint): } resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers=headers) assert resp.status_code == 201 + _expected_sessionhour_id = opaque_key(['localhost:80', 'haha', '1981-01-01', '0']) assert_saved_with( - mock_save, - # doc_id: sha256(b'http://example.foo/|yxwvu|97098dd3f7cd26053c0d0264d1c84eaeea8e08d2c55ca34017ffbe53c749ba5a|1981-01-01|3|api,view').hexdigest() - expected_doc_id='6d7549df6734bb955eb832c6316ffae46c2959c95b5817ab4fcb341dbc875c23', + mock_es8, + expected_doc_id=opaque_key([ + 'http://example.foo/', + _expected_sessionhour_id, + "['api', 'view']", + 'http://example.foo/bliz/', + '1981-01-01', + '3', + ]), expected_attrs={ 'platform_iri': 'http://example.foo/', - 'item_guid': preprint._id, - # session_id: 
sha256(b'localhost:80|haha|1981-01-01|0').hexdigest() - 'session_id': '97098dd3f7cd26053c0d0264d1c84eaeea8e08d2c55ca34017ffbe53c749ba5a', - 'action_labels': ['view', 'api'], + 'item_osfid': preprint._id, + 'item_type': str(OSF.Preprint), + 'sessionhour_id': opaque_key(['localhost:80', 'haha', '1981-01-01', '0']), + 'action_labels': ['api', 'view'], 'pageview_info': { 'page_url': 'http://example.foo/bliz/', 'page_path': '/bliz', @@ -244,9 +271,10 @@ def test_by_useragent_header(self, app, mock_save, preprint): class TestGuidFields: @pytest.fixture(autouse=True) - def _real_elastic(self): - with djelme_test_backends(): - yield + def mock_domain(self): + domain = 'http://example.foo/' + with mock.patch('website.settings.DOMAIN', new=domain): + yield domain @pytest.fixture def preprint(self, item_public): @@ -286,7 +314,7 @@ def child_reg_file(self, child_reg): def child_reg_file_guid(self, child_reg_file): return child_reg_file.get_guid(create=True)._id - def test_preprint_file(self, app, mock_save, preprint, item_public): + def test_preprint_file(self, app, mock_es8, preprint, item_public, mock_domain): # test_preprint_guid payload = counted_usage_payload( item_guid=preprint._id, @@ -295,16 +323,18 @@ def test_preprint_file(self, app, mock_save, preprint, item_public): resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'}) assert resp.status_code == 201 assert_saved_with( - mock_save, + mock_es8, expected_attrs={ - 'item_guid': preprint._id, - 'item_type': 'preprint', + 'item_osfid': preprint._id, + 'item_iri': f'{mock_domain}{preprint._id}', + 'item_type': str(OSF.Preprint), 'item_public': item_public, 'provider_id': preprint.provider._id, - 'surrounding_guids': None, + 'database_iri': f'{mock_domain}preprints/{preprint.provider._id}', + 'within_iris': [f'{mock_domain}{preprint._id}'], }, ) - mock_save.reset_mock() + mock_es8.reset_mock() # test_preprint_file_guid payload = counted_usage_payload( @@ -314,17 +344,22 @@ def 
test_preprint_file(self, app, mock_save, preprint, item_public): resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'}) assert resp.status_code == 201 assert_saved_with( - mock_save, + mock_es8, expected_attrs={ - 'item_guid': preprint.primary_file.get_guid()._id, - 'item_type': 'osfstoragefile', + 'item_osfid': preprint.primary_file.get_guid()._id, + 'item_iri': preprint.primary_file.get_semantic_iri(), + 'item_type': str(OSF.File), 'item_public': item_public, 'provider_id': preprint.primary_file.provider, - 'surrounding_guids': [preprint._id], + 'database_iri': f'urn:files.osf.io:{preprint.primary_file.provider}', + 'within_iris': sorted([ + f'{mock_domain}{preprint._id}', + preprint.primary_file.get_semantic_iri(), + ]), }, ) - def test_child_registration_file(self, app, mock_save, child_reg_file_guid, child_reg, parent_reg, item_public): + def test_child_registration_file(self, app, mock_es8, child_reg_file_guid, child_reg_file, child_reg, parent_reg, item_public, mock_domain): # test_child_registration_file_guid payload = counted_usage_payload( item_guid=child_reg_file_guid, @@ -333,20 +368,22 @@ def test_child_registration_file(self, app, mock_save, child_reg_file_guid, chil resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'}) assert resp.status_code == 201 assert_saved_with( - mock_save, + mock_es8, expected_attrs={ 'action_labels': ['view', 'web'], - 'item_guid': child_reg_file_guid, - 'item_type': 'osfstoragefile', + 'item_osfid': child_reg_file_guid, + 'item_type': str(OSF.File), 'item_public': item_public, 'provider_id': 'osfstorage', - 'surrounding_guids': [ - child_reg._id, - parent_reg._id, - ], + 'database_iri': 'urn:files.osf.io:osfstorage', + 'within_iris': sorted([ + child_reg_file.get_semantic_iri(), + child_reg.get_semantic_iri(), + parent_reg.get_semantic_iri(), + ]), }, ) - mock_save.reset_mock() + mock_es8.reset_mock() # test_child_registration_guid payload = 
counted_usage_payload( @@ -356,19 +393,22 @@ def test_child_registration_file(self, app, mock_save, child_reg_file_guid, chil resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'}) assert resp.status_code == 201 assert_saved_with( - mock_save, + mock_es8, expected_attrs={ 'action_labels': ['view', 'web'], - 'item_guid': child_reg._id, - 'item_type': 'registration', + 'item_osfid': child_reg._id, + 'item_type': str(OSF.RegistrationComponent), 'item_public': item_public, 'provider_id': 'osf', - 'surrounding_guids': [ - parent_reg._id, - ], + 'database_iri': f'{mock_domain}registries/osf', + 'item_iri': child_reg.get_semantic_iri(), + 'within_iris': sorted([ + child_reg.get_semantic_iri(), + parent_reg.get_semantic_iri(), + ]), }, ) - mock_save.reset_mock() + mock_es8.reset_mock() # test_parent_registration_guid payload = counted_usage_payload( @@ -378,13 +418,15 @@ def test_child_registration_file(self, app, mock_save, child_reg_file_guid, chil resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'}) assert resp.status_code == 201 assert_saved_with( - mock_save, + mock_es8, expected_attrs={ 'action_labels': ['view', 'web'], - 'item_guid': parent_reg._id, + 'item_osfid': parent_reg._id, 'item_public': item_public, 'provider_id': 'osf', - 'surrounding_guids': None, + 'database_iri': f'{mock_domain}registries/osf', + 'item_iri': parent_reg.get_semantic_iri(), + 'within_iris': [parent_reg.get_semantic_iri()], }, ) @@ -392,7 +434,7 @@ def test_child_registration_file(self, app, mock_save, child_reg_file_guid, chil @pytest.mark.django_db class TestContributorExclusion: - def test_creator_pageview_not_recorded(self, app, mock_save): + def test_creator_pageview_not_recorded(self, app, mock_es8): user = AuthUserFactory() project = ProjectFactory(creator=user) payload = counted_usage_payload( @@ -402,14 +444,14 @@ def test_creator_pageview_not_recorded(self, app, mock_save): ) resp = 
app.post_json_api(COUNTED_USAGE_URL, payload, auth=user.auth) assert resp.status_code == 204 - assert mock_save.call_count == 0 + assert mock_es8.index.call_count == 0 @pytest.mark.parametrize( 'permissions', [READ, WRITE, ADMIN], ids=['read', 'write', 'admin'], ) - def test_contributor_pageview_not_recorded(self, app, mock_save, permissions): + def test_contributor_pageview_not_recorded(self, app, mock_es8, permissions): creator = AuthUserFactory() contributor = AuthUserFactory() project = ProjectFactory(creator=creator) @@ -421,9 +463,9 @@ def test_contributor_pageview_not_recorded(self, app, mock_save, permissions): ) resp = app.post_json_api(COUNTED_USAGE_URL, payload, auth=contributor.auth) assert resp.status_code == 204 - assert mock_save.call_count == 0 + assert mock_es8.index.call_count == 0 - def test_non_contributor_pageview_recorded(self, app, mock_save): + def test_non_contributor_pageview_recorded(self, app, mock_es8): creator = AuthUserFactory() visitor = AuthUserFactory() project = ProjectFactory(creator=creator, is_public=True) @@ -434,9 +476,9 @@ def test_non_contributor_pageview_recorded(self, app, mock_save): ) resp = app.post_json_api(COUNTED_USAGE_URL, payload, auth=visitor.auth) assert resp.status_code == 201 - assert mock_save.call_count == 1 + assert mock_es8.index.call_count == 1 - def test_parent_contributor_not_on_child_component_pageview_recorded(self, app, mock_save): + def test_parent_contributor_not_on_child_component_pageview_recorded(self, app, mock_es8): creator = AuthUserFactory() child_owner = AuthUserFactory() parent_reader = AuthUserFactory() @@ -451,9 +493,9 @@ def test_parent_contributor_not_on_child_component_pageview_recorded(self, app, ) resp = app.post_json_api(COUNTED_USAGE_URL, payload, auth=parent_reader.auth) assert resp.status_code == 201 - assert mock_save.call_count == 1 + assert mock_es8.index.call_count == 1 - def test_anonymous_view_only_link_visitor_pageview_recorded(self, app, mock_save): + def 
test_anonymous_view_only_link_visitor_pageview_recorded(self, app, mock_es8): creator = AuthUserFactory() project = ProjectFactory(creator=creator, is_public=False) link = PrivateLinkFactory(anonymous=True, creator=creator) @@ -468,9 +510,9 @@ def test_anonymous_view_only_link_visitor_pageview_recorded(self, app, mock_save ) resp = app.post_json_api(COUNTED_USAGE_URL, payload) assert resp.status_code == 201 - assert mock_save.call_count == 1 + assert mock_es8.index.call_count == 1 - def test_logged_in_non_contributor_view_only_link_pageview_recorded(self, app, mock_save): + def test_logged_in_non_contributor_view_only_link_pageview_recorded(self, app, mock_es8): creator = AuthUserFactory() visitor = AuthUserFactory() project = ProjectFactory(creator=creator, is_public=False) @@ -485,14 +527,14 @@ def test_logged_in_non_contributor_view_only_link_pageview_recorded(self, app, m ) resp = app.post_json_api(COUNTED_USAGE_URL, payload, auth=visitor.auth) assert resp.status_code == 201 - assert mock_save.call_count == 1 + assert mock_es8.index.call_count == 1 @pytest.mark.parametrize( 'permissions', [READ, WRITE, ADMIN], ids=['read', 'write', 'admin'], ) - def test_logged_in_contributor_view_only_link_pageview_not_recorded(self, app, mock_save, permissions): + def test_logged_in_contributor_view_only_link_pageview_not_recorded(self, app, mock_es8, permissions): creator = AuthUserFactory() contributor = AuthUserFactory() project = ProjectFactory(creator=creator, is_public=False) @@ -508,4 +550,4 @@ def test_logged_in_contributor_view_only_link_pageview_not_recorded(self, app, m ) resp = app.post_json_api(COUNTED_USAGE_URL, payload, auth=contributor.auth) assert resp.status_code == 204 - assert mock_save.call_count == 0 + assert mock_es8.index.call_count == 0 diff --git a/api_tests/metrics/test_preprint_metrics.py b/api_tests/metrics/test_preprint_metrics.py deleted file mode 100644 index cd9b8041c2d..00000000000 --- a/api_tests/metrics/test_preprint_metrics.py +++ 
/dev/null @@ -1,240 +0,0 @@ -import pytest -from unittest import mock -from datetime import datetime - -from website.app import setup_django - -setup_django() - -from django.utils import timezone -from waffle.testutils import override_switch -from elasticsearch6.exceptions import RequestError - -from osf import features -from api.base.settings import API_PRIVATE_BASE as API_BASE -from osf.metrics import PreprintDownload, PreprintView -from osf_tests.factories import AuthUserFactory, PreprintFactory, NodeFactory - -pytestmark = pytest.mark.django_db - - -@pytest.mark.django_db -class TestPreprintMetrics: - - @pytest.fixture(autouse=True) - def enable_elasticsearch_metrics(self): - with override_switch(features.ELASTICSEARCH_METRICS, active=True): - yield - - @pytest.fixture - def user(self): - user = AuthUserFactory() - user.is_staff = True - user.add_system_tag('preprint_metrics') - user.save() - return user - - @pytest.fixture - def other_user(self): - return AuthUserFactory() - - @pytest.fixture - def other_admin_user(self): - user = AuthUserFactory() - user.is_staff = True - user.save() - return user - - @pytest.fixture - def other_non_admin_user(self): - user = AuthUserFactory() - user.add_system_tag('preprint_metrics') - user.save() - return user - - @pytest.fixture - def preprint(self, user): - preprint = PreprintFactory(creator=user) - return preprint - - @pytest.fixture - def preprint_two(self): - return PreprintFactory() - - @pytest.fixture - def preprint_three(self): - return PreprintFactory() - - @pytest.fixture - def preprint_no_results(self): - return PreprintFactory() - - @pytest.fixture - def project(self): - return NodeFactory() - - @pytest.fixture - def project_two(self): - return NodeFactory() - - @pytest.fixture - def metric_dates(self): - return ['2019-01-01', '2019-01-02', '2019-01-03'] - - def add_views_and_downloads(self, preprint_to_add, user_to_use, dates_to_use): - # create 3 timestamps for 3 days, 1 hour apart - times = ['T00:05', 
'T01:05', 'T02:05'] - - metrics = [PreprintView, PreprintDownload] - for metric in metrics: - for date in dates_to_use: - for time in times: - metric.record_for_preprint( - preprint=preprint_to_add, - user=user_to_use, - path=preprint_to_add.primary_file.path, - timestamp=datetime.strptime(date + time, '%Y-%m-%dT%H:%M') - ) - - @pytest.fixture - def base_url(self): - return f'/{API_BASE}metrics/preprints/' - - @mock.patch('api.metrics.views.PreprintDownloadMetrics.execute_search') - def test_custom_metric_malformed_query(self, mock_execute, app, user, base_url): - mock_execute.side_effect = RequestError() - post_url = f'{base_url}downloads/' - post_data = { - 'data': { - 'type': 'preprint_metric', - 'attributes': { - 'query': {'not_a_field': 'Yay!'} - } - } - } - res = app.post_json_api(post_url, post_data, auth=user.auth, expect_errors=True) - assert res.status_code == 400 - assert res.json['errors'][0]['detail'] == 'Malformed elasticsearch query.' - - @pytest.mark.es_metrics - def test_agg_query(self, app, user, base_url): - - post_url = f'{base_url}downloads/' - - payload = { - 'data': { - 'type': 'preprint_metrics', - 'attributes': { - 'query': { - 'aggs': { - 'preprints_by_year': { - 'composite': { - 'sources': [{ - 'date': { - 'date_histogram': { - 'field': 'timestamp', - 'interval': 'year' - } - } - }] - } - } - } - } - } - } - } - resp = app.post_json_api(post_url, payload, auth=user.auth) - assert resp.status_code == 200 - - @mock.patch('api.metrics.views.PreprintDownloadMetrics.format_response') - @mock.patch('api.metrics.views.PreprintDownloadMetrics.execute_search') - def test_post_custom_metric(self, mock_execute, mock_format, app, user, base_url, preprint, other_user): - mock_return = {'good': 'job'} - mock_execute.return_value.to_dict.return_value = mock_return - mock_format.return_value = mock_return - post_url = f'{base_url}downloads/' - post_data = { - 'data': { - 'type': 'preprint_metrics', - 'attributes': { - 'query': mock_return - } - } - } - 
res = app.post_json_api(post_url, post_data, auth=user.auth) - assert res.json == mock_return - - @pytest.mark.parametrize('metric_name', ['downloads', 'views']) - @mock.patch('api.metrics.utils.timezone.now') - def test_preprint_list_with_metrics_fails(self, mock_timezone, app, user, base_url, preprint, preprint_two, - preprint_three, metric_name, other_user, project, project_two, - other_admin_user, other_non_admin_user): - mock_timezone.return_value = datetime(2019, 1, 4, tzinfo=timezone.utc) - url = f'{base_url}{metric_name}/' - - one_preprint_url = f'{url}?guids={preprint._id}' - # test non-logged in cannot access - res = app.get(one_preprint_url, expect_errors=True) - assert res.status_code == 401 - - # test logged in non-metrics, non-admin user cannot access - res = app.get(one_preprint_url, auth=other_user.auth, expect_errors=True) - assert res.status_code == 403 - - # test logged in, non-metrics, admin user cannot access - res = app.get(one_preprint_url, auth=other_admin_user.auth, expect_errors=True) - assert res.status_code == 403 - - # test logged in, metrics, non-admin user cannot access - res = app.get(one_preprint_url, auth=other_non_admin_user.auth, expect_errors=True) - assert res.status_code == 403 - - @pytest.mark.skip('Return results will be entirely mocked so does not make a lot of sense to run on ci.') - @mock.patch('api.metrics.utils.timezone.now') - def test_preprint_with_metrics_succeeds(self, mock_timezone, app, user, base_url, preprint, other_user, - preprint_no_results, metric_dates): - mock_timezone.return_value = datetime(2019, 1, 4, tzinfo=timezone.utc) - self.add_views_and_downloads(preprint, other_user, metric_dates) - metric_name = 'downloads' - - mock_timezone.return_value = datetime(2019, 1, 4, tzinfo=timezone.utc) - url = f'{base_url}{metric_name}/' - one_preprint_url = f'{url}?guids={preprint._id}' - - # base url should return all results - res = app.get(one_preprint_url, auth=user.auth) - assert res.json['metric_type'] == 
metric_name - assert len(res.json['data']) == 3 - - # starting a day later only returns 2 results - later_url = f'{one_preprint_url}&start_datetime=2019-01-02' - res = app.get(later_url, auth=user.auth) - assert len(res.json['data']) == 2 - datetimes = [result.keys()[0] for result in res.json['data']] - assert '2019-01-01T00:05:00.000Z' not in datetimes - - # filter between two specific datetimes - two_times_url = f'{one_preprint_url}&start_datetime=2019-01-02T00:00&end_datetime=2019-01-02T02:00' - res = app.get(two_times_url, auth=user.auth) - assert len(res.json['data']) == 1 - datetimes = [result.keys()[0] for result in res.json['data']] - assert '2019-01-01T00:05:00.000Z' not in datetimes - assert '2019-01-01T03:05:00.000Z' not in datetimes - - # test two specific datetimes with minute interval - two_min_interval = f'{one_preprint_url}&start_datetime=2019-01-02T00:00&end_datetime=2019-01-02T02:00&interval=1m' - res = app.get(two_min_interval, auth=user.auth) - assert len(res.json['data']) == 61 - first = res.json['data'][0] - last = res.json['data'][-1] - assert first.keys() == ['2019-01-02T00:05:00.000Z'] - assert first['2019-01-02T00:05:00.000Z'] == {preprint._id: 1} - assert last.keys() == ['2019-01-02T01:05:00.000Z'] - assert last['2019-01-02T01:05:00.000Z'] == {preprint._id: 1} - - # make sure requesting one preprint with no results is OK - non_preprint_url = f'{url}?guids={preprint_no_results._id}' - res = app.get(non_preprint_url, auth=user.auth) - assert res.status_code == 200 - assert res.json['data'] == [] diff --git a/api_tests/metrics/test_queries.py b/api_tests/metrics/test_queries.py index 8b19247f5b4..f5c24877c60 100644 --- a/api_tests/metrics/test_queries.py +++ b/api_tests/metrics/test_queries.py @@ -1,111 +1,259 @@ +import datetime from unittest import mock -import pytest +from django.test import TestCase +from elasticsearch_metrics.tests.util import RealElasticTestCase +from osf.metrics.es8_metrics import OsfCountedUsageEvent from 
osf_tests.factories import NodeFactory, AuthUserFactory -@pytest.mark.django_db -class TestNodeAnalyticsQuery: - @pytest.fixture - def mock_search(self): - with mock.patch('elasticsearch6.Elasticsearch.search', autospec=True) as mock_search: - yield mock_search - @pytest.mark.parametrize('timespan', ['week', 'fortnight', 'month']) - def test_private_node(self, app, mock_search, timespan): - node = NodeFactory(is_public=False) - guid = node._id - resp = app.get( - f'/_/metrics/query/node_analytics/{guid}/{timespan}/', - expect_errors=True, - ) - assert resp.status_code == 401 +class TestNodeAnalyticsQueryErrors: + def test_private_node_anon(self, app): + _node = NodeFactory(is_public=False) + with mock.patch('elasticsearch8.Elasticsearch.search') as _mock_search: + for timespan in ['week', 'fortnight', 'month']: + resp = app.get( + f'/_/metrics/query/node_analytics/{_node._id}/{timespan}/', + expect_errors=True, + ) + assert resp.status_code == 401 + assert _mock_search.call_count == 0 + + def test_private_node_rando(self, app): + _node = NodeFactory(is_public=False) + _user = AuthUserFactory() + with mock.patch('elasticsearch8.Elasticsearch.search') as _mock_search: + for timespan in ['week', 'fortnight', 'month']: + resp = app.get( + f'/_/metrics/query/node_analytics/{_node._id}/{timespan}/', + expect_errors=True, + auth=_user.auth, + ) + assert resp.status_code == 403 + assert _mock_search.call_count == 0 - user = AuthUserFactory() - resp = app.get( - f'/_/metrics/query/node_analytics/{guid}/{timespan}/', - auth=user.auth, - expect_errors=True, - ) - assert resp.status_code == 403 - assert mock_search.call_count == 0 +class TestNodeAnalyticsQuery(RealElasticTestCase, TestCase): + def setUp(self): + super().setUp() + self._node = NodeFactory(is_public=True) + self._osfid = self._node._id + self._today = datetime.date.today() + self._now = datetime.datetime( + self._today.year, + self._today.month, + self._today.day, + 12, + tzinfo=datetime.UTC, + ) + ### + # past 
week + OsfCountedUsageEvent.record( + sessionhour_id='s1', + item_osfid=self._osfid, + action_labels=['view', 'web'], + timestamp=self._now - datetime.timedelta(hours=1), + pageview_info={ + 'referer_url': 'http://somewhere.example.com/there', + 'page_url': 'http://osf.example/page/path', + 'route_name': 'page.route', + 'page_title': 'foo', + } + ) + OsfCountedUsageEvent.record( + sessionhour_id='s2', + item_osfid=self._osfid, + action_labels=['view', 'web'], + timestamp=self._now - datetime.timedelta(days=1), + pageview_info={ + 'referer_url': 'http://somewhere.example.com/there', + 'page_url': 'http://osf.example/page/path', + 'route_name': 'page.route', + 'page_title': 'foo', + } + ) + OsfCountedUsageEvent.record( + sessionhour_id='s3', + item_osfid=self._osfid, + action_labels=['view', 'web'], + timestamp=self._now - datetime.timedelta(days=1, hours=1), + pageview_info={ + 'referer_url': 'http://somewhere.example.com/there', + 'page_url': 'http://osf.example/page/another', + 'route_name': 'page.another', + 'page_title': 'blaz', + } + ) + OsfCountedUsageEvent.record( + sessionhour_id='s4', + item_osfid=self._osfid, + action_labels=['view', 'web'], + timestamp=self._now - datetime.timedelta(days=1, hours=2), + pageview_info={ + 'referer_url': 'http://elsewhere.example.com/there', + 'page_url': 'http://osf.example/page/another', + 'route_name': 'page.another', + 'page_title': 'blaz', + } + ) + OsfCountedUsageEvent.record( + sessionhour_id='s5', + item_osfid=self._osfid, + action_labels=['view', 'web'], + timestamp=self._now - datetime.timedelta(days=2, hours=1), + pageview_info={ + 'page_url': 'http://osf.example/page/another', + 'route_name': 'page.another', + 'page_title': 'blaz', + } + ) + OsfCountedUsageEvent.record( + sessionhour_id='s6', + item_osfid=self._osfid, + action_labels=['view', 'web'], + timestamp=self._now - datetime.timedelta(days=2, hours=2), + pageview_info={ + 'page_url': 'http://osf.example/page/another', + 'route_name': 'page.another', + 
'page_title': 'blaz', + } + ) + ### + # past fortnight + OsfCountedUsageEvent.record( + sessionhour_id='s7', + item_osfid=self._osfid, + action_labels=['view', 'web'], + timestamp=self._now - datetime.timedelta(days=10, hours=1), + pageview_info={ + 'referer_url': 'http://elsewhere.example.com/there', + 'page_url': 'http://osf.example/page/another', + 'route_name': 'page.another', + 'page_title': 'blaz', + } + ) + ### + # past month + OsfCountedUsageEvent.record( + sessionhour_id='s8', + item_osfid=self._osfid, + action_labels=['view', 'web'], + timestamp=self._now - datetime.timedelta(days=20, hours=1), + pageview_info={ + 'referer_url': 'http://somewhere.example.com/anothere', + 'page_url': 'http://osf.example/page/another', + 'route_name': 'page.another', + 'page_title': 'blaz', + } + ) + ### + # older than a month + OsfCountedUsageEvent.record( + sessionhour_id='s9', + item_osfid=self._osfid, + action_labels=['view', 'web'], + timestamp=self._now - datetime.timedelta(days=80, hours=7), + pageview_info={ + 'referer_url': 'http://somewhere.example.com/anothere', + 'page_url': 'http://osf.example/page/another', + 'route_name': 'page.another', + 'page_title': 'blaz', + } + ) + # refresh + OsfCountedUsageEvent.refresh() - @pytest.mark.parametrize('timespan', ['week', 'fortnight', 'month']) - def test_public_node(self, app, mock_search, timespan): - node = NodeFactory(is_public=True) - guid = node._id - mock_search.return_value = { - 'aggregations': { - 'popular-pages': { - 'buckets': [ - { - 'key': '/page/path', - 'doc_count': 17, - 'route-for-path': { - 'buckets': [{'key': 'page.route'}], - }, - 'title-for-path': { - 'buckets': [{'key': 'foo'}], - }, - }, - { - 'key': '/page/another', - 'doc_count': 7, - 'route-for-path': { - 'buckets': [{'key': 'page.another'}], - }, - 'title-for-path': { - 'buckets': [{'key': 'blaz'}], - }, - }, - ], - }, - 'unique-visits': { - 'buckets': [ - {'key': 1646265600000, 'key_as_string': '2022-03-03', 'doc_count': 8}, - {'key': 
1646352000000, 'key_as_string': '2022-03-04', 'doc_count': 1}, - ], - }, - 'time-of-day': { - 'buckets': [ - {'key': 8, 'doc_count': 1}, - {'key': 9, 'doc_count': 2}, - {'key': 10, 'doc_count': 3}, - ], - }, - 'referer-domain': { - 'buckets': [ - {'key': 'somewhere.example.com', 'doc_count': 9}, - {'key': 'elsewhere.example.com', 'doc_count': 4}, - ], - }, + def test_public_node(self): + _week_resp = self.client.get(f'/_/metrics/query/node_analytics/{self._osfid}/week/') + assert _week_resp.json()['data'] == { + 'id': f'{self._osfid}:week', + 'type': 'node-analytics', + 'attributes': { + 'popular_pages': [ + {'route': 'page.another', 'path': '/page/another', 'title': 'blaz', 'count': 4}, + {'route': 'page.route', 'path': '/page/path', 'title': 'foo', 'count': 2}, + ], + 'unique_visits': [ + {'date': str(self._today - datetime.timedelta(days=2)), 'count': 2}, + {'date': str(self._today - datetime.timedelta(days=1)), 'count': 3}, + {'date': str(self._today), 'count': 1}, + ], + 'time_of_day': [ + {'hour': 11, 'count': 3}, + {'hour': 10, 'count': 2}, + {'hour': 12, 'count': 1}, + ], + 'referer_domain': [ + {'referer_domain': 'somewhere.example.com', 'count': 3}, + {'referer_domain': 'elsewhere.example.com', 'count': 1}, + ], }, } - resp = app.get(f'/_/metrics/query/node_analytics/{guid}/{timespan}/') - assert resp.json['data'] == { - 'id': f'{guid}:{timespan}', + _fortnight_resp = self.client.get(f'/_/metrics/query/node_analytics/{self._osfid}/fortnight/') + assert _fortnight_resp.json()['data'] == { + 'id': f'{self._osfid}:fortnight', 'type': 'node-analytics', 'attributes': { 'popular_pages': [ - {'route': 'page.route', 'path': '/page/path', 'title': 'foo', 'count': 17}, - {'route': 'page.another', 'path': '/page/another', 'title': 'blaz', 'count': 7}, + {'route': 'page.another', 'path': '/page/another', 'title': 'blaz', 'count': 5}, + {'route': 'page.route', 'path': '/page/path', 'title': 'foo', 'count': 2}, ], 'unique_visits': [ - {'date': '2022-03-03', 'count': 
8}, - {'date': '2022-03-04', 'count': 1}, + {'date': str(self._today - datetime.timedelta(days=10)), 'count': 1}, + *( + {'date': str(self._today - datetime.timedelta(days=_n)), 'count': 0} + for _n in range(9, 2, -1) + ), + {'date': str(self._today - datetime.timedelta(days=2)), 'count': 2}, + {'date': str(self._today - datetime.timedelta(days=1)), 'count': 3}, + {'date': str(self._today), 'count': 1}, ], 'time_of_day': [ - {'hour': 8, 'count': 1}, - {'hour': 9, 'count': 2}, - {'hour': 10, 'count': 3}, + {'hour': 11, 'count': 4}, + {'hour': 10, 'count': 2}, + {'hour': 12, 'count': 1}, ], 'referer_domain': [ - {'referer_domain': 'somewhere.example.com', 'count': 9}, - {'referer_domain': 'elsewhere.example.com', 'count': 4}, + {'referer_domain': 'somewhere.example.com', 'count': 3}, + {'referer_domain': 'elsewhere.example.com', 'count': 2}, ], }, } - assert mock_search.call_count == 1 + _month_resp = self.client.get(f'/_/metrics/query/node_analytics/{self._osfid}/month/') + assert _month_resp.json()['data'] == { + 'id': f'{self._osfid}:month', + 'type': 'node-analytics', + 'attributes': { + 'popular_pages': [ + {'route': 'page.another', 'path': '/page/another', 'title': 'blaz', 'count': 6}, + {'route': 'page.route', 'path': '/page/path', 'title': 'foo', 'count': 2}, + ], + 'unique_visits': [ + {'date': str(self._today - datetime.timedelta(days=20)), 'count': 1}, + *( + {'date': str(self._today - datetime.timedelta(days=_n)), 'count': 0} + for _n in range(19, 10, -1) + ), + {'date': str(self._today - datetime.timedelta(days=10)), 'count': 1}, + *( + {'date': str(self._today - datetime.timedelta(days=_n)), 'count': 0} + for _n in range(9, 2, -1) + ), + {'date': str(self._today - datetime.timedelta(days=2)), 'count': 2}, + {'date': str(self._today - datetime.timedelta(days=1)), 'count': 3}, + {'date': str(self._today), 'count': 1}, + ], + 'time_of_day': [ + {'hour': 11, 'count': 5}, + {'hour': 10, 'count': 2}, + {'hour': 12, 'count': 1}, + ], + 'referer_domain': [ + 
{'referer_domain': 'somewhere.example.com', 'count': 4}, + {'referer_domain': 'elsewhere.example.com', 'count': 2}, + ], + }, + } diff --git a/api_tests/metrics/test_registries_moderation_metrics.py b/api_tests/metrics/test_registries_moderation_metrics.py index f5d3a047b10..cda2a03b391 100644 --- a/api_tests/metrics/test_registries_moderation_metrics.py +++ b/api_tests/metrics/test_registries_moderation_metrics.py @@ -1,11 +1,21 @@ import pytest +from elasticsearch_metrics.tests.util import djelme_test_backends +from waffle.testutils import override_switch +from osf import features from osf_tests.factories import RegistrationFactory, AuthUserFactory from osf.utils.workflows import RegistrationModerationStates, RegistrationModerationTriggers -from osf.metrics import RegistriesModerationMetrics +from osf.metrics.es8_metrics import RegistriesModerationEventEs8 from tests.utils import capture_notifications -pytestmark = pytest.mark.django_db + +@pytest.fixture +def real_elastic(): + with ( + override_switch(features.ELASTICSEARCH_METRICS, active=True), + djelme_test_backends(), + ): + yield @pytest.mark.django_db @@ -15,8 +25,7 @@ class TestRegistrationModerationMetrics: def registration(self): return RegistrationFactory() - @pytest.mark.es_metrics - def test_record_transitions(self, registration): + def test_record_transitions(self, registration, real_elastic): with capture_notifications(): registration._write_registration_action( RegistrationModerationStates.INITIAL, @@ -24,10 +33,10 @@ def test_record_transitions(self, registration): registration.creator, 'Metrics is easy' ) - RegistriesModerationMetrics._get_connection().indices.refresh(RegistriesModerationMetrics._template_pattern) + RegistriesModerationEventEs8.refresh() - assert RegistriesModerationMetrics.search().count() == 1 - data = RegistriesModerationMetrics.search().execute()['hits']['hits'][0]['_source'] + assert RegistriesModerationEventEs8.search().count() == 1 + data = 
RegistriesModerationEventEs8.search().execute()['hits']['hits'][0]['_source'] assert data['from_state'] == RegistrationModerationStates.INITIAL.db_name assert data['to_state'] == RegistrationModerationStates.PENDING.db_name @@ -59,8 +68,7 @@ def other_user(self): def base_url(self): return '/_/metrics/registries_moderation/transitions/' - @pytest.mark.es_metrics - def test_registries_moderation_view(self, app, user, base_url, registration): + def test_registries_moderation_view(self, app, user, base_url, registration, real_elastic): with capture_notifications(): registration._write_registration_action( RegistrationModerationStates.INITIAL, @@ -68,7 +76,7 @@ def test_registries_moderation_view(self, app, user, base_url, registration): registration.creator, 'Metrics is easy' ) - RegistriesModerationMetrics._get_connection().indices.refresh(RegistriesModerationMetrics._template_pattern) + RegistriesModerationEventEs8.refresh() res = app.get(base_url, auth=user.auth, expect_errors=True) data = res.json diff --git a/api_tests/metrics/test_reports.py b/api_tests/metrics/test_reports.py index db748bdb05b..bebb42059b8 100644 --- a/api_tests/metrics/test_reports.py +++ b/api_tests/metrics/test_reports.py @@ -21,7 +21,7 @@ def mock_domain(self): @pytest.fixture def mock_search(self): - with mock.patch('elasticsearch6.Elasticsearch.search', autospec=True) as mock_search: + with mock.patch('elasticsearch8.Elasticsearch.search', autospec=True) as mock_search: yield mock_search def test_report_names(self, app, mock_domain): @@ -44,11 +44,11 @@ def test_report_names(self, app, mock_domain): @pytest.mark.parametrize('report_name', expected_report_names) def test_recent_reports(self, app, mock_domain, mock_search, report_name): - mock_search.return_value = { + mock_search.return_value.body = { 'hits': { 'hits': [ - {'_id': 'hi-by', '_source': {'report_date': '1234-12-12', 'hello': 'goodbye'}}, - {'_id': 'doof', '_source': {'report_date': '1234-12-11', 'hello': 'upwa'}}, + {'_id': 
'hi-by', '_source': {'report_date': '1234-12-12', 'hello': 'goodbye', 'created': '1235-12-13T01:00:00Z'}}, + {'_id': 'doof', '_source': {'report_date': '1234-12-11', 'hello': 'upwa', 'created': '1235-12-12T01:00:00Z'}}, ], }, } @@ -58,17 +58,19 @@ def test_recent_reports(self, app, mock_domain, mock_search, report_name): assert resp.json['data'] == [ { 'id': 'hi-by', - 'type': f'daily-report:{report_name}', + 'type': f'cyclic-report:{report_name}', 'attributes': { 'report_date': '1234-12-12', 'hello': 'goodbye', + 'created': '1235-12-13T01:00:00Z', }, }, { 'id': 'doof', - 'type': f'daily-report:{report_name}', + 'type': f'cyclic-report:{report_name}', 'attributes': { 'report_date': '1234-12-11', 'hello': 'upwa', + 'created': '1235-12-12T01:00:00Z', }, } ] @@ -84,12 +86,12 @@ def test_recent_reports(self, app, mock_domain, mock_search, report_name): assert resp.unicode_body == CSV_REPORTS -TSV_REPORTS = '''report_date hello -1234-12-12 goodbye -1234-12-11 upwa +TSV_REPORTS = '''report_date created hello +1234-12-12 1235-12-13 01:00:00+00:00 goodbye +1234-12-11 1235-12-12 01:00:00+00:00 upwa '''.replace('\n', '\r\n') -CSV_REPORTS = '''report_date,hello -1234-12-12,goodbye -1234-12-11,upwa +CSV_REPORTS = '''report_date,created,hello +1234-12-12,1235-12-13 01:00:00+00:00,goodbye +1234-12-11,1235-12-12 01:00:00+00:00,upwa '''.replace('\n', '\r\n') diff --git a/api_tests/preprints/views/test_preprint_detail_metrics.py b/api_tests/preprints/views/test_preprint_detail_metrics.py index f98777be678..9d945e8159f 100644 --- a/api_tests/preprints/views/test_preprint_detail_metrics.py +++ b/api_tests/preprints/views/test_preprint_detail_metrics.py @@ -17,17 +17,13 @@ def enable_elasticsearch_metrics(self): with override_switch(features.ELASTICSEARCH_METRICS, active=True): yield - @pytest.mark.parametrize(('metric_name', 'metric_class_name'), - [ - ('downloads', 'PreprintDownload'), - ('views', 'PreprintView'), - ]) - def test_preprint_detail_with_downloads(self, app, settings, 
metric_name, metric_class_name): + @pytest.mark.parametrize('metric_name', ['downloads', 'views']) + def test_preprint_detail_with_downloads(self, app, settings, metric_name): preprint = PreprintFactory() url = f'/{API_BASE}preprints/{preprint._id}/?metrics[{metric_name}]=total' - with mock.patch(f'api.preprints.views.{metric_class_name}.get_count_for_preprint') as mock_get_count_for_preprint: - mock_get_count_for_preprint.return_value = 42 + with mock.patch('api.base.metrics.UsageMetricsViewMixin._get_usage_count') as mock_get_count: + mock_get_count.return_value = 42 res = app.get(url) assert res.status_code == 200 diff --git a/api_tests/preprints/views/test_preprint_list.py b/api_tests/preprints/views/test_preprint_list.py index 3208c397893..15d12079328 100644 --- a/api_tests/preprints/views/test_preprint_list.py +++ b/api_tests/preprints/views/test_preprint_list.py @@ -1,9 +1,8 @@ from unittest import mock -import datetime as dt import pytest from django.utils import timezone -from waffle.testutils import override_switch, override_flag +from waffle.testutils import override_flag from addons.github.models import GithubFile from api.base.settings.defaults import API_BASE @@ -1027,65 +1026,3 @@ def provider(self): @pytest.fixture() def url(self, project): return f'/{API_BASE}preprints/?version=2.2&' - - -@pytest.mark.django_db -class TestPreprintListWithMetrics: - - # enable the ELASTICSEARCH_METRICS switch for all tests - @pytest.fixture(autouse=True) - def enable_elasticsearch_metrics(self): - with override_switch(features.ELASTICSEARCH_METRICS, active=True): - yield - - @pytest.mark.parametrize( - ('metric_name', 'metric_class_name'), - [ - ('downloads', 'PreprintDownload'), - ('views', 'PreprintView'), - ], - ) - def test_preprint_list_with_metrics(self, app, metric_name, metric_class_name): - url = f'/{API_BASE}preprints/?metrics[{metric_name}]=total' - preprint1 = PreprintFactory() - preprint1.downloads = 41 - preprint2 = PreprintFactory() - 
preprint2.downloads = 42 - - with mock.patch(f'api.preprints.views.{metric_class_name}.get_top_by_count') as mock_get_top_by_count: - mock_get_top_by_count.return_value = [preprint2, preprint1] - res = app.get(url) - assert res.status_code == 200 - - preprint_2_data = res.json['data'][0] - assert preprint_2_data['meta']['metrics']['downloads'] == 42 - - preprint_1_data = res.json['data'][1] - assert preprint_1_data['meta']['metrics']['downloads'] == 41 - - @mock.patch('django.utils.timezone.now') - @pytest.mark.parametrize( - ('query_value', 'timedelta'), - [ - ('daily', dt.timedelta(days=1)), - ('weekly', dt.timedelta(days=7)), - ('yearly', dt.timedelta(days=365)), - ], - ) - def test_preprint_list_filter_metric_by_time_period(self, mock_timezone_now, app, settings, query_value, timedelta): - url = f'/{API_BASE}preprints/?metrics[views]={query_value}' - mock_now = dt.datetime.utcnow().replace(tzinfo=timezone.utc) - mock_timezone_now.return_value = mock_now - - preprint1 = PreprintFactory() - preprint1.views = 41 - preprint2 = PreprintFactory() - preprint2.views = 42 - - with mock.patch('api.preprints.views.PreprintView.get_top_by_count') as mock_get_top_by_count: - mock_get_top_by_count.return_value = [preprint2, preprint1] - res = app.get(url) - - assert res.status_code == 200 - call_kwargs = mock_get_top_by_count.call_args[1] - assert call_kwargs['after'] == mock_now - timedelta diff --git a/api_tests/providers/preprints/views/test_preprint_provider_list.py b/api_tests/providers/preprints/views/test_preprint_provider_list.py index c1624fd58f9..21499744d77 100644 --- a/api_tests/providers/preprints/views/test_preprint_provider_list.py +++ b/api_tests/providers/preprints/views/test_preprint_provider_list.py @@ -1,8 +1,5 @@ -from unittest import mock import pytest -from waffle.testutils import override_switch -from osf import features from api.base.settings.defaults import API_BASE from osf_tests.factories import ( AuthUserFactory, @@ -65,28 +62,3 @@ def 
test_preprint_provider_list_filtering( url, filter_type, filter_value)) assert res.status_code == 200 assert len(res.json['data']) == 1 - - -@pytest.mark.django_db -class TestPreprintProviderListWithMetrics: - - # enable the ELASTICSEARCH_METRICS switch for all tests - @pytest.fixture(autouse=True) - def enable_elasticsearch_metrics(self): - with override_switch(features.ELASTICSEARCH_METRICS, active=True): - yield - - def test_preprint_provider_list_with_metrics(self, app, url, provider_one, provider_two): - provider_one.downloads = 41 - provider_two.downloads = 42 - with mock.patch('api.preprints.views.PreprintDownload.get_top_by_count') as mock_get_top_by_count: - mock_get_top_by_count.return_value = [provider_one, provider_two] - res = app.get(url + 'metrics[downloads]=total') - - assert res.status_code == 200 - - provider_2_data = res.json['data'][0] - provider_2_data['meta']['metrics']['downloads'] == 42 - - provider_1_data = res.json['data'][1] - provider_1_data['meta']['metrics']['downloads'] == 41 diff --git a/osf/features.yaml b/osf/features.yaml index cce490a25a4..1da56e44f79 100644 --- a/osf/features.yaml +++ b/osf/features.yaml @@ -93,11 +93,6 @@ switches: name: enable_inactive_schemas note: This is no longer used - - flag_name: COUNTEDUSAGE_UNIFIED_METRICS_2024 - name: countedusage_unified_metrics_2024 - note: use only `osf.metrics.counted_usage`-based metrics where possible; un-use PageCounter, PreprintView, PreprintDownload, etc - active: false - - flag_name: ENABLE_MAILHOG name: enable_mailhog note: This is used to enable the MailHog email testing service, this will allow emails to be sent to the diff --git a/osf/management/commands/make_dummy_pageviews_for_metrics.py b/osf/management/commands/make_dummy_pageviews_for_metrics.py deleted file mode 100644 index 09de34bf7a8..00000000000 --- a/osf/management/commands/make_dummy_pageviews_for_metrics.py +++ /dev/null @@ -1,118 +0,0 @@ -"""osf/management/commands/poke_metrics_timespan_queries.py -""" 
-import logging -import random -import datetime - -from django.core.management.base import BaseCommand -from osf.metrics import CountedAuthUsage - - -logger = logging.getLogger(__name__) - -TIME_FILTERS = ( - {'gte': 'now/d-150d'}, - {'gte': '2021-11-28T23:00:00.000Z', 'lte': '2023-01-16T00:00:00.000Z'}, -) - -PLATFORM_IRI = 'http://localhost:9201/' - -ITEM_GUID = 'foo' - - -class Command(BaseCommand): - - def add_arguments(self, parser): - parser.add_argument( - '--count', - type=int, - default=100, - help='number of fake pageviews to generate', - ) - parser.add_argument( - '--seconds_back', - type=int, - default=60 * 60 * 24 * 14, # up to two weeks back - help='max age in seconds of random event', - ) - - def handle(self, *args, **options): - self._generate_random_countedusage(options.get('count'), options.get('seconds_back')) - - results = [ - self._run_date_query(time_filter) - for time_filter in TIME_FILTERS - ] - - self._print_line( - (str(f) for f in TIME_FILTERS), - label='timefilter:', - ) - - date_keys = { - k - for r in results - for k in r - } - for date_key in sorted(date_keys): - self._print_line( - (r.get(date_key, 0) for r in results), - label=str(date_key), - ) - - def _print_line(self, lineitems, label=''): - print('\t'.join((label, *map(str, lineitems)))) - - def _generate_random_countedusage(self, n, max_age): - now = datetime.datetime.now(tz=datetime.UTC) - for _ in range(n): - seconds_back = random.randint(0, max_age) - timestamp_time = now - datetime.timedelta(seconds=seconds_back) - CountedAuthUsage.record( - platform_iri=PLATFORM_IRI, - timestamp=timestamp_time, - item_guid=ITEM_GUID, - session_id='freshen by key', - user_is_authenticated=bool(random.randint(0, 1)), - item_public=bool(random.randint(0, 1)), - action_labels=[['view', 'download'][random.randint(0, 1)]], - ) - - def _run_date_query(self, time_range_filter): - result = self._run_query({ - 'query': { - 'bool': { - 'filter': { - 'range': { - 'timestamp': time_range_filter, - }, - 
}, - }, - }, - 'aggs': { - 'by-date': { - 'date_histogram': { - 'field': 'timestamp', - 'interval': 'day', - }, - }, - 'max-timestamp': { - 'max': {'field': 'timestamp'}, - }, - 'min-timestamp': { - 'min': {'field': 'timestamp'}, - }, - }, - }) - return { - 'min': result.aggs['min-timestamp'].value, - 'max': result.aggs['max-timestamp'].value, - **{ - str(bucket.key.date()): bucket.doc_count - for bucket in result.aggs['by-date'] - }, - } - - def _run_query(self, query_dict): - analytics_search = CountedAuthUsage.search().update_from_dict(query_dict) - return analytics_search.execute() diff --git a/osf/management/commands/metrics_backfill_pageviews.py b/osf/management/commands/metrics_backfill_pageviews.py deleted file mode 100644 index 13898037923..00000000000 --- a/osf/management/commands/metrics_backfill_pageviews.py +++ /dev/null @@ -1,203 +0,0 @@ -"""osf/management/commands/metrics_backfill_pageviews.py - -Usage: - - $ dc-manage metrics_backfill_pageviews --source=$path_to_csv - $ dc-manage metrics_backfill_pageviews --source=$path_to_csv --dry # dry run - $ dc-manage metrics_backfill_pageviews --source=$path_to_csv --resume-from 1264 # start from record 1264 - - -""" -import csv -import logging -import datetime - -from django.core.management.base import BaseCommand -from osf.metrics import CountedAuthUsage -from osf.models import Guid - -logger = logging.getLogger(__name__) - -def main(source, dry_run=False, resume_from=None): - if not source: - logger.info('No source file detected, exiting.') - return - - # keen.timestamp => _source.timestamp # "2023-01-19T04:06:45.675432+00:00", - # page.info.protocol + page.info.domain => _source.platform_iri # "http://localhost:5000/", - # visitor.session => _source.session_id # "fcae918a3b6a19641bd0087f84083f0d57982d8c93ab821c405561d1b5c7b305", - # user.id => _source.user_is_authenticated # true, - # page.url => _source.pageview_info.page_url # "http://localhost:5000/my-projects/", - # page.title => 
_source.pageview_info.page_title # "OSF | My Projects", - # referrer.url => _source.pageview_info.referer_url # "http://localhost:5000/csab4/analytics", - # page.meta.routeName => _source.pageview_info.route_name # "OsfWebRenderer.my_projects", - # time.utc.hour_of_day => _source.pageview_info.hour_of_day # 4, - # page.info.path => _source.pageview_info.page_path # "/my-projects", - # referrer.info.domain => _source.pageview_info.referer_domain # "localhost:5000" - # page.meta.public => _source.item_public # true, - # node.id => _source.item_guid # "ry7dn", - - # ??? => _source.provider_id # "osf", - # ??? => _source.item_type # "node" - # ??? => _source.surrounding_guids = # [parent_guids?] - # ??? => _source.action_labels # ["web"] - - count = 0 - reader = csv.DictReader(source) - for row in reader: - if not row['page.url'].startswith('https://staging.osf.io'): - continue - - count += 1 - if resume_from is not None and count < resume_from: - continue - - something_wonderful = { - 'timestamp': _timestamp_to_dt(row['keen.timestamp']), - 'platform_iri': row['page.info.protocol'] + '://' + row['page.info.domain'], - 'session_id': row['visitor.session'], - 'user_is_authenticated': row['user.id'] is not None, - 'item_guid': row['node.id'], - 'item_public': row['page.meta.public'] or row['page.meta.pubic'], # unfortunate misspelling - 'pageview_info': { - 'hour_of_day': row['time.utc.hour_of_day'], - 'page_path': row['page.info.path'], - 'page_title': row['page.title'], - 'page_url': row['page.url'], - 'referer_url': row['referrer.url'], - 'referer_domain': row['referrer.info.domain'], - 'route_name': row['page.meta.routeName'], - }, - } - - db_info = annotate_from_db(row) - if db_info: - something_wonderful.update(db_info) - populate_action_labels(something_wonderful, row) - - logger.info(f'*** {count}: something wonderful:({something_wonderful})') - - if not dry_run: - CountedAuthUsage.record(**something_wonderful) - -def populate_action_labels(something_wonderful, 
row): - labels = ['web'] - - if row['page.info.path']: - path_parts = row['page.info.path'].split('/') - if len(path_parts) == 1 and path_parts[0] not in ('my-projects', 'goodbye', 'login'): - labels.append('view') - elif path_parts[1] in ('wiki'): - labels.append('view') - - if row['page.meta.routeName']: - route_name = row['page.meta.routeName'] - if 'search' in route_name: - labels.append('search') - - something_wonderful['action_labels'] = labels - -guid_cache = {} -# this may be done by CountedAuthUsage._fill_osfguid_info -def annotate_from_db(row): - item_guid = row['node.id'] - if not item_guid: - return - - if not guid_cache.get(item_guid, None): - guid_info = {} - guid_instance = Guid.load(item_guid) - - if guid_instance and guid_instance.referent: - guid_info = _fill_osfguid_info(guid_instance.referent) - guid_cache[item_guid] = guid_info - - return guid_cache[item_guid] - -# from CountedAuthUsage -def _fill_osfguid_info(guid_referent): - guid_info = {} - guid_info['item_public'] = _get_ispublic(guid_referent) - guid_info['item_type'] = type(guid_referent).__name__.lower() - guid_info['surrounding_guids'] = _get_surrounding_guids(guid_referent) - guid_info['provider_id'] = _get_provider_id(guid_referent) - return guid_info - -def _get_ispublic(guid_referent): - # if it quacks like BaseFileNode, look at .target instead - maybe_public = getattr(guid_referent, 'target', None) or guid_referent - if hasattr(maybe_public, 'verified_publishable'): - return maybe_public.verified_publishable # quacks like Preprint - return getattr(maybe_public, 'is_public', None) # quacks like AbstractNode - -def _get_provider_id(guid_referent): - provider = getattr(guid_referent, 'provider', None) - if isinstance(provider, str): - return provider # quacks like BaseFileNode - elif provider: - return provider._id # quacks like Registration, Preprint, Collection - return 'osf' # quacks like Node, Comment, WikiPage - -def _get_immediate_wrapper(guid_referent): - if 
hasattr(guid_referent, 'verified_publishable'): - return None # quacks like Preprint - return ( - getattr(guid_referent, 'parent_node', None) # quacks like AbstractNode - or getattr(guid_referent, 'node', None) # quacks like WikiPage, Comment - or getattr(guid_referent, 'target', None) # quacks like BaseFileNode - ) - -def _get_surrounding_guids(guid_referent): - """get all the parent/owner/surrounding guids for the given guid_referent - - @param guid_referent: instance of a model that has GuidMixin - @returns list of str - - For AbstractNode, goes up the node hierarchy up to the root. - For WikiPage or BaseFileNode, grab the node it belongs to and - follow the node hierarchy from there. - """ - surrounding_guids = [] - current_referent = guid_referent - while current_referent: - next_referent = _get_immediate_wrapper(current_referent) - if next_referent: - surrounding_guids.append(next_referent._id) - current_referent = next_referent - return surrounding_guids - -def _timestamp_to_dt(timestamp): - return datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=datetime.UTC) - -def _timestamp_to_date(timestamp): - dt_obj = _timestamp_to_dt(timestamp) - return str(dt_obj.date()) - - -class Command(BaseCommand): - - def add_arguments(self, parser): - super().add_arguments(parser) - parser.add_argument( - '--source', - type=open, - help='source file (csv format w/ header line)', - ) - parser.add_argument( - '--dry', - dest='dry', - action='store_true', - help='Dry run' - ) - parser.add_argument( - '--resume-from', - dest='resume_from', - type=int, - help='start from which record', - ) - - def handle(self, *args, **options): - dry_run = options.get('dry', None) - source = options.get('source', None) - resume_from = options.get('resume_from', None) - main(source, dry_run, resume_from) diff --git a/osf/management/commands/metrics_backfill_summaries.py b/osf/management/commands/metrics_backfill_summaries.py deleted file mode 100644 index 
d259e9b2a52..00000000000 --- a/osf/management/commands/metrics_backfill_summaries.py +++ /dev/null @@ -1,435 +0,0 @@ -"""osf/management/commands/metrics_backfill_summaries.py - -usage: - - $ dc-manage metrics_backfill_summaries --which=$which_metric --source=$path_to_csv - -where ``$which_metric`` is one of: - - file_summary - download_count - preprint_summary - institution_summary - user_summary - node_summary - -""" -import csv -import logging -import datetime - -from django.core.management.base import BaseCommand -from osf.metrics import ( - DownloadCountReport, - InstitutionSummaryReport, - # NewUserDomainReport, - NodeSummaryReport, - OsfstorageFileCountReport, - PreprintSummaryReport, - # StorageAddonUsage, - UserSummaryReport, -) - - -logger = logging.getLogger(__name__) - - -def main(source, which, dry_run=False, resume_from=None): - if which not in SUMMARIES: - logger.info(f'No such summary, {which}, exiting.') - return - - if not source: - logger.info('No path to source data file, exiting.') - return - - summary_meta = SUMMARIES[which] - - logger.info('Kicking off...') - with open(source) as csvfile: - reader = csv.DictReader(csvfile) - - count = 0 - for row in reader: - count += 1 - if resume_from is not None and count < resume_from: - continue - - something_wonderful = summary_meta['mapper'](row) - logger.info(f'{count}: transformed:({something_wonderful})') - if not dry_run: - summary_meta['class'].record(**something_wonderful) - - logger.info('All done!') - if which == 'preprint_summary': - logger.error(f'Unrecognized provider names: ({bogus_preprints})') - - -def _map_download_count(row): - # date(keen.timestamp) => _source.report_date # "2022-12-30", - # keen.created_at => _source.timestamp # "2023-01-02T14:58:38.041721+00:00" - # files.total => _source.daily_file_downloads # 0, - return { - 'report_date': _timestamp_to_date(row['keen.timestamp']), - 'timestamp': _timestamp_to_dt(row['keen.created_at']), - 'daily_file_downloads': 
int(row['files.total']), - } - -def _map_file_summary(row): - # date(keen.timestamp) => _source.report_date # "2022-12-30", - # keen.created_at => _source.timestamp # "2023-01-02T14:59:04.397056+00:00" - # osfstorage_files.private => _source.files.private # 12146, - # osfstorage_files.total_daily => _source.files.total_daily # 0, - # osfstorage_files.public_daily => _source.files.public_daily # 0, - # osfstorage_files.private_daily => _source.files.private_daily # 0 - return { - 'report_date': _timestamp_to_date(row['keen.timestamp']), - 'timestamp': _timestamp_to_dt(row['keen.created_at']), - 'files': { - 'total': int(row['osfstorage_files.total']), - 'public': int(row['osfstorage_files.public']), - 'private': int(row['osfstorage_files.private']), - 'total_daily': int(row['osfstorage_files.total_daily']), - 'public_daily': int(row['osfstorage_files.public_daily']), - 'private_daily': int(row['osfstorage_files.private_daily']), - }, - } - - -def _map_institution_summary(row): - # date(keen.timestamp) => _source.report_date # "2022-12-30", - # keen.created => _source.timestamp # "2023-01-02T14:59:01.706319+00:00" - # institution.id => _source.institution_id # "okstate", - # institution.name => _source.institution_name # "Oklahoma State University [Test]", - # ### => _source.users # {} - # users.total => _source.total # 0, - # users.total_daily => _source.total_daily # 0 - # ### => _source.nodes # {} - # nodes.total => _source.nodes.total": 0, - # nodes.public => _source.nodes.public": 0, - # nodes.private => _source.nodes.private": 0, - # nodes.total_daily => _source.nodes.total_daily": 0, - # nodes.public_daily => _source.nodes.public_daily": 0, - # nodes.private_daily => _source.nodes.private_daily": 0 - # ### => _source.projects # {} - # projects.total => _source.projects.total": 0, - # projects.public => _source.projects.public": 0, - # projects.private => _source.projects.private": 0, - # projects.total_daily => _source.projects.total_daily": 0, - # 
projects.public_daily => _source.projects.public_daily": 0, - # projects.private_daily => _source.projects.private_daily": 0 - # ### => _source.registered_nodes # {} - # registered_nodes.total => _source.registered_nodes.total": 0, - # registered_nodes.public => _source.registered_nodes.public": 0, - # registered_nodes.embargoed => _source.registered_nodes.embargoed": 0, - # registered_nodes.embargoed_v2 => _source.registered_nodes.embargoed_v2": 0, - # registered_nodes.total_daily => _source.registered_nodes.total_daily": 0, - # registered_nodes.public_daily => _source.registered_nodes.public_daily": 0, - # registered_nodes.embargoed_daily => _source.registered_nodes.embargoed_daily": 0, - # registered_nodes.embargoed_v2_daily => _source.registered_nodes.embargoed_v2_daily": 0 - # ### => _source.registered_projects # {} - # registered_projects.total => _source.registered_projects.total": 0, - # registered_projects.public => _source.registered_projects.public": 0, - # registered_projects.embargoed => _source.registered_projects.embargoed": 0, - # registered_projects.embargoed_v2 => _source.registered_projects.embargoed_v2": 0, - # registered_projects.total_daily => _source.registered_projects.total_daily": 0, - # registered_projects.public_daily => _source.registered_projects.public_daily": 0, - # registered_projects.embargoed_daily => _source.registered_projects.embargoed_daily": 0, - # registered_projects.embargoed_v2_daily => _source.registered_projects.embargoed_v2_daily": 0 - return { - 'report_date': _timestamp_to_date(row['keen.timestamp']), - 'timestamp': _timestamp_to_dt(row['keen.created_at']), - 'institution_id': row['institution.id'], - 'institution_name': row['institution.name'], - 'users': { - 'total': int(row['users.total']), - 'total_daily': int(row['users.total_daily'] or 0), - }, - 'nodes': { - 'total': int(row['nodes.total']), - 'public': int(row['nodes.public']), - 'private': int(row['nodes.private']), - 'total_daily': 
int(row['nodes.total_daily'] or 0), - 'public_daily': int(row['nodes.public_daily'] or 0), - 'private_daily': int(row['nodes.private_daily'] or 0), - }, - 'projects': { - 'total': int(row['projects.total']), - 'public': int(row['projects.public']), - 'private': int(row['projects.private']), - 'total_daily': int(row['projects.total_daily'] or 0), - 'public_daily': int(row['projects.public_daily'] or 0), - 'private_daily': int(row['projects.private_daily'] or 0), - }, - 'registered_nodes': { - 'total': int(row['registered_nodes.total']), - 'public': int(row['registered_nodes.public']), - 'embargoed': int(row['registered_nodes.embargoed']), - 'embargoed_v2': int(row['registered_nodes.embargoed_v2'] or 0), - 'total_daily': int(row['registered_nodes.total_daily'] or 0), - 'public_daily': int(row['registered_nodes.public_daily'] or 0), - 'embargoed_daily': int(row['registered_nodes.embargoed_daily'] or 0), - 'embargoed_v2_daily': int(row['registered_nodes.embargoed_v2_daily'] or 0), - }, - 'registered_projects': { - 'total': int(row['registered_projects.total']), - 'public': int(row['registered_projects.public']), - 'embargoed': int(row['registered_projects.embargoed']), - 'embargoed_v2': int(row['registered_projects.embargoed_v2'] or 0), - 'total_daily': int(row['registered_projects.total_daily'] or 0), - 'public_daily': int(row['registered_projects.public_daily'] or 0), - 'embargoed_daily': int(row['registered_projects.embargoed_daily'] or 0), - 'embargoed_v2_daily': int(row['registered_projects.embargoed_v2_daily'] or 0), - }, - } - -def _map_node_summary(row): - # date(keen.timestamp) => _source.report_date # "2022-12-30", - # keen.created_at => _source.timestamp # "2023-01-02T14:59:03.886999+00:00" - # ### => _source.nodes # {} - # nodes.total => _source.nodes.total # 58, - # nodes.total_excluding_spam => _source.nodes.total_excluding_spam # 58, - # nodes.public => _source.nodes.public # 14, - # nodes.private => _source.nodes.private # 44, - # nodes.total_daily => 
_source.nodes.total_daily # 0, - # nodes.total_daily_excluding_spam => _source.nodes.total_daily_excluding_spam # 0, - # nodes.public_daily => _source.nodes.public_daily # 0, - # nodes.private_daily => _source.nodes.private_daily # 0 - # ### => _source.projects # {} - # projects.total => _source.projects.total # 53, - # projects.total_excluding_spam => _source.projects.total_excluding_spam # 53, - # projects.public => _source.projects.public # 14, - # projects.private => _source.projects.private # 39, - # projects.total_daily => _source.projects.total_daily # 0, - # projects.total_daily_excluding_spam => _source.projects.total_daily_excluding_spam # 0, - # projects.public_daily => _source.projects.public_daily # 0, - # projects.private_daily => _source.projects.private_daily # 0 - # ### => _source.registered_nodes # {} - # registered_nodes.total => _source.registered_nodes.total # 10, - # registered_nodes.public => _source.registered_nodes.public # 9, - # registered_nodes.embargoed => _source.registered_nodes.embargoed # 1, - # registered_nodes.embargoed_v2 => _source.registered_nodes.embargoed_v2 # 0, - # registered_nodes.withdrawn => _source.registered_nodes.withdrawn # 0, - # registered_nodes.total_daily => _source.registered_nodes.total_daily # 0, - # registered_nodes.public_daily => _source.registered_nodes.public_daily # 0, - # registered_nodes.embargoed_daily => _source.registered_nodes.embargoed_daily # 0, - # registered_nodes.embargoed_v2_daily => _source.registered_nodes.embargoed_v2_daily # 0, - # registered_nodes.withdrawn_daily => _source.registered_nodes.withdrawn_daily # 0 - # ### => _source.registered_projects # {} - # registered_projects.total => _source.registered_projects."total # 10, - # registered_projects.public => _source.registered_projects."public # 9, - # registered_projects.embargoed => _source.registered_projects."embargoed # 1, - # registered_projects.embargoed_v2 => _source.registered_projects."embargoed_v2 # 0, - # 
registered_projects.withdrawn => _source.registered_projects."withdrawn # 0, - # registered_projects.total_daily => _source.registered_projects."total_daily # 0, - # registered_projects.public_daily => _source.registered_projects."public_daily # 0, - # registered_projects.embargoed_daily => _source.registered_projects."embargoed_daily # 0, - # registered_projects.embargoed_v2_daily => _source.registered_projects."embargoed_v2_daily # 0, - # registered_projects.withdrawn_daily => _source.registered_projects."withdrawn_daily # 0 - return { - 'report_date': _timestamp_to_date(row['keen.timestamp']), - 'timestamp': _timestamp_to_dt(row['keen.created_at']), - 'nodes': { - 'total': int(row['nodes.total'] or 0), - 'total_excluding_spam': int(row['nodes.total_excluding_spam'] or 0), - 'public': int(row['nodes.public'] or 0), - 'private': int(row['nodes.private'] or 0), - 'total_daily': int(row['nodes.total_daily'] or 0), - 'total_daily_excluding_spam': int(row['nodes.total_daily_excluding_spam'] or 0), - 'public_daily': int(row['nodes.public_daily'] or 0), - 'private_daily': int(row['nodes.private_daily'] or 0), - }, - 'projects': { - 'total': int(row['projects.total']), - 'total_excluding_spam': int(row['projects.total_excluding_spam'] or 0), - 'public': int(row['projects.public'] or 0), - 'private': int(row['projects.private'] or 0), - 'total_daily': int(row['projects.total_daily'] or 0), - 'total_daily_excluding_spam': int(row['projects.total_daily_excluding_spam'] or 0), - 'public_daily': int(row['projects.public_daily'] or 0), - 'private_daily': int(row['projects.private_daily'] or 0), - }, - 'registered_nodes': { - 'total': int(row['registered_nodes.total'] or 0), - 'public': int(row['registered_nodes.public'] or 0), - 'embargoed': int(row['registered_nodes.embargoed'] or 0), - 'embargoed_v2': int(row['registered_nodes.embargoed_v2'] or 0), - 'withdrawn': int(row['registered_nodes.withdrawn'] or 0), - 'total_daily': int(row['registered_nodes.total_daily'] or 0), - 
'public_daily': int(row['registered_nodes.public_daily'] or 0), - 'embargoed_daily': int(row['registered_nodes.embargoed_daily'] or 0), - 'embargoed_v2_daily': int(row['registered_nodes.embargoed_v2_daily'] or 0), - 'withdrawn_daily': int(row['registered_nodes.withdrawn_daily'] or 0), - }, - 'registered_projects': { - 'total': int(row['registered_projects.total'] or 0), - 'public': int(row['registered_projects.public'] or 0), - 'embargoed': int(row['registered_projects.embargoed'] or 0), - 'embargoed_v2': int(row['registered_projects.embargoed_v2'] or 0), - 'withdrawn': int(row['registered_projects.withdrawn'] or 0), - 'total_daily': int(row['registered_projects.total_daily'] or 0), - 'public_daily': int(row['registered_projects.public_daily'] or 0), - 'embargoed_daily': int(row['registered_projects.embargoed_daily'] or 0), - 'embargoed_v2_daily': int(row['registered_projects.embargoed_v2_daily'] or 0), - 'withdrawn_daily': int(row['registered_projects.withdrawn_daily'] or 0), - }, - } - - -preprint_name_map = { - 'AfricArXiv': 'africarxiv', - 'AgriXiv': 'agrixiv', - 'Arabixiv': 'arabixiv', - 'BioHackrXiv': 'biohackrxiv', - 'BITSS': 'metaarxiv', - 'BodoArXiv': 'bodoarxiv', - 'coppreprints': 'coppreprints', - 'EarthArXiv': 'eartharxiv', - 'EcoEvoRxiv': 'ecoevorxiv', - 'ECSarXiv': 'ecsarxiv', - 'EdArXiv': 'edarxiv', - 'engrXiv': 'engrxiv', - 'FocUS Archive': 'focusarchive', - 'Frenxiv': 'frenxiv', - 'INA-Rxiv': 'inarxiv', - 'IndiaRxiv': 'indiarxiv', - 'LawArXiv': 'lawarxiv', - 'LIS Scholarship Archive': 'lissa', - 'LiveData': 'livedata', - 'Research AZ': 'livedata', - 'MarXiv': 'marxiv', - 'MedArXiv': 'medarxiv', - 'MediArXiv': 'mediarxiv', - 'MetaArXiv': 'metaarxiv', - 'MindRxiv': 'mindrxiv', - 'NutriXiv': 'nutrixiv', - 'Open Science Framework': 'osf', - 'PaleorXiv': 'paleorxiv', - 'PsyArXiv': 'psyarxiv', - 'SocArXiv': 'socarxiv', - 'SportRxiv': 'sportrxiv', - 'Thesis Commons': 'thesiscommons', - 'Vulnerability Assessment Testing': 'vulnerabilityassessmenttesting', 
-} -preprint_long_names = list(preprint_name_map.keys()) -preprint_short_names = list(preprint_name_map.values()) -bogus_preprints = {} -def _map_preprint_summary(row): - # date(keen.timestamp) => _source.report_date # "2022-12-30", - # keen.created_at => _source.timestamp # "2023-01-02T14:59:05.684642+00:00" - # provider.name => _source.provider_key # "psyarxiv", - # provider.total => _source.preprint_count # 0, - - # normalize provider names: we used to store the formal name, now we store the short name - provider_key = None - provider_name = row['provider.name'] - if provider_name in preprint_short_names: - provider_key = provider_name - elif provider_name in preprint_long_names: - provider_key = preprint_name_map[provider_name] - else: - logger.error(f'Unrecognized preprint provider name: ({provider_name})') - if provider_name not in bogus_preprints: - bogus_preprints[provider_name] = 0 - bogus_preprints[provider_name] += 1 - provider_key = provider_name # oh well - - return { - 'report_date': _timestamp_to_date(row['keen.timestamp']), - 'timestamp': _timestamp_to_dt(row['keen.created_at']), - 'provider_key': provider_key, - 'preprint_count': int(row['provider.total']), - } - -def _map_user_summary(row): - # date(keen.timestamp) => _source.report_date # "2023-01-03", - # keen.created_at => _source.timestamp # "2023-01-04T13:47:34.216419+00:00" - # status.active => _source.active # 7, - # status.deactivated => _source.deactivated # 0, - # status.merged => _source.merged # 0, - # status.new_users_daily => _source.new_users_daily # 0, - # status.new_users_with_institution_daily => _source.new_users_with_institution_daily # 0, - # status.unconfirmed => _source.unconfirmed # 0, - return { - 'report_date': _timestamp_to_date(row['keen.timestamp']), - 'timestamp': _timestamp_to_dt(row['keen.created_at']), - 'active': int(row['status.active']), - 'deactivated': int(row['status.deactivated'] or 0), - 'merged': int(row['status.merged'] or 0), - 'new_users_daily': 
int(row['status.new_users_daily'] or 0), - 'new_users_with_institution_daily': int(row['status.new_users_with_institution_daily'] or 0), - 'unconfirmed': int(row['status.unconfirmed'] or 0), - } - -SUMMARIES = { - 'download_count': { - 'mapper': _map_download_count, - 'class': DownloadCountReport, - }, - 'file_summary': { - 'mapper': _map_file_summary, - 'class': OsfstorageFileCountReport, - }, - 'institution_summary': { - 'mapper': _map_institution_summary, - 'class': InstitutionSummaryReport, - }, - 'node_summary': { - 'mapper': _map_node_summary, - 'class': NodeSummaryReport, - }, - 'preprint_summary': { - 'mapper': _map_preprint_summary, - 'class': PreprintSummaryReport, - }, - 'user_summary': { - 'mapper': _map_user_summary, - 'class': UserSummaryReport, - }, -} - -def _timestamp_to_dt(timestamp): - return datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ') - -def _timestamp_to_date(timestamp): - dt_obj = _timestamp_to_dt(timestamp) - return dt_obj.date() - - -def _dt_to_date(dt): - dt_obj = datetime.datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S.%fZ') - return str(dt_obj.date()) - -class Command(BaseCommand): - - def add_arguments(self, parser): - super().add_arguments(parser) - parser.add_argument( - '--source', - type=str, - help='source file path (csv format w/ header line)', - ) - parser.add_argument( - '--dry', - dest='dry', - action='store_true', - help='Dry run' - ) - parser.add_argument( - '--which', - type=str, - help='which metric summary this data is for' - ) - parser.add_argument( - '--resume-from', - dest='resume_from', - type=int, - help='start from which record', - ) - - def handle(self, *args, **options): - dry_run = options.get('dry', None) - source = options.get('source', None) - which = options.get('which', None) - resume_from = options.get('resume_from', None) - main(source, which, dry_run, resume_from) diff --git a/osf/management/commands/metrics_backfill_user_domains.py b/osf/management/commands/metrics_backfill_user_domains.py 
deleted file mode 100644 index 685dd55243e..00000000000 --- a/osf/management/commands/metrics_backfill_user_domains.py +++ /dev/null @@ -1,130 +0,0 @@ -"""osf/management/commands/metrics_backfill_user_domains.py - -Usage: - - $ dc-manage metrics_backfill_user_domains --source=$path_to_csv - $ dc-manage metrics_backfill_user_domains --source=$path_to_csv --dry # dry run - $ dc-manage metrics_backfill_user_domains --source=$path_to_csv --resume-from 1264 # start from record 1264 - - -""" -import csv -import logging -import datetime - -from django.core.management.base import BaseCommand -from osf.metrics import NewUserDomainReport - -logger = logging.getLogger(__name__) - -def main(source, dry_run=False, resume_from=None): - if not source: - logger.info('No source file detected, exiting.') - return - - # new user domains report is weird, b/c old data needs to be aggregated by date & domain - - count = 0 - reader = csv.DictReader(source) - tally = {} - this_year = None - for row in reader: - count += 1 - if resume_from is not None and count < resume_from: - continue - - logger.info(f'count:({count}) this_year:({this_year})') - - event_ts = _timestamp_to_dt(row['keen.timestamp']) - event_date = event_ts.date() - event_date_str = str(event_date) - - if this_year is None: - logger.info(' >>> setting new year') - this_year = event_date.year - - if this_year != event_date.year: - # we've built up a year of data; commit and clear - logger.info(' >>> year is up, committing data') - _upload_data_and_purge(tally, dry_run) - this_year = event_date.year - logger.info(' >>> data committed, new year is:({}) and tally should be ' - 'empty:({})'.format(this_year, tally)) - - if event_date_str not in tally: - tally[event_date_str] = { - 'timestamp': event_ts, - 'report_date': event_date, - 'domains': {}, - } - - domain = row['domain'] - if domain not in tally[event_date_str]['domains']: - tally[event_date_str]['domains'][domain] = 0 - tally[event_date_str]['domains'][domain] += 1 - - 
_upload_data_and_purge(tally, dry_run) - - -def _upload_data_and_purge(tally, dry_run): - for event_date_str, record in tally.items(): - for domain, count in record['domains'].items(): - - # date(keen.timestamp) => _source.report_date # "2022-12-30", - # keen.created_at => _source.timestamp # "2023-01-02T14:59:05.684642+00:00" - # domain => _source.domain_name # metrics.Keyword() - # count_agg(domain) => _source.new_user_count # metrics.Integer() - - something_wonderful = { - 'timestamp': record['timestamp'], - 'report_date': record['report_date'], - 'domain_name': domain, - 'new_user_count': count, - } - - logger.info(f' *** {event_date_str}::{domain}::{count}') - logger.info(' *** {}::{}: something wonderful:({})'.format(event_date_str, domain, - something_wonderful)) - - if not dry_run: - NewUserDomainReport.record(**something_wonderful) - - # purge tally - tally.clear() - - -def _timestamp_to_dt(timestamp): - return datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=datetime.UTC) - -def _timestamp_to_date(timestamp): - dt_obj = _timestamp_to_dt(timestamp) - return str(dt_obj.date()) - - -class Command(BaseCommand): - - def add_arguments(self, parser): - super().add_arguments(parser) - parser.add_argument( - '--source', - type=open, - help='source file (csv format w/ header line)', - ) - parser.add_argument( - '--dry', - dest='dry', - action='store_true', - help='Dry run' - ) - parser.add_argument( - '--resume-from', - dest='resume_from', - type=int, - help='start from which record', - ) - - def handle(self, *args, **options): - dry_run = options.get('dry', None) - source = options.get('source', None) - resume_from = options.get('resume_from', None) - main(source, dry_run, resume_from) diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index efa37547396..a3f9e6f30af 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -12,7 +12,10 @@ osfid_from_iri, ) from osf.metrics.counted_usage import 
_get_surrounding_guids -from osf.metrics.utils import YearMonth +from osf.metrics.utils import ( + YearMonth, + cycle_coverage_yearmonth, +) from osf import models as osfdb from osf.models.base import osfid_iri from website import settings as website_settings @@ -208,7 +211,7 @@ def _autofill_item_public(self): def _autofill_item_type(self): if self.item_osfid and not self.item_type: - self.item_type = osfmap_type(self._osfid_referent) + self.item_type = str(osfmap_type(self._osfid_referent)) def _autofill_provider_id(self): if self.item_osfid and not self.provider_id: @@ -221,16 +224,15 @@ def _autofill_provider_id(self): self.provider_id = _provider._id # quacks like Registration, Preprint, Collection def _autofill_within_iris(self): - if self.item_osfid and (self.within_iris is None) and self._osfid_referent: + if self.item_osfid and (not self.within_iris) and self._osfid_referent: self.within_iris = [ osfid_iri(_osfid) for _osfid in _get_surrounding_guids(self._osfid_referent) ] # ensure inclusive "within" - if not self.within_iris: - self.within_iris = [self.item_iri] if self.item_iri not in self.within_iris: self.within_iris = [self.item_iri, *self.within_iris] + self.within_iris = sorted(self.within_iris) def _autofill_pageview(self): # autofill pageview_info fields from other fields @@ -333,9 +335,55 @@ class UsageByStorageAddon(esdsl.InnerDoc): # Cyclic reports -class DailyStorageAddonUsageReportEs8(djelme.CyclicRecord): +class BaseMonthlyReport(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = MONTHLY + + class Meta: + abstract = True + + @classmethod + def most_recent_cycle(cls, base_search=None) -> str | None: + _search = base_search or cls.search() + _search = _search[0:0] # omit hits + _search.aggs.bucket( + 'agg_most_recent_cycle', + 'terms', + field='cycle_coverage', + order={'_key': 'desc'}, + size=1, + ) + _response = _search.execute() + if not _response.aggregations: + return None + _buckets = _response.aggregations.agg_most_recent_cycle.buckets + if not 
_buckets: + return None + return _buckets[0].key + + def __init__(self, *, report_yearmonth=None, **kwargs): + super().__init__(**kwargs) + # separate out report_yearmonth, so the property setter gets used + if report_yearmonth is not None: + self.report_yearmonth = report_yearmonth + + @property + def report_yearmonth(self): + _year, _month = self.cycle_coverage.split('.') + return YearMonth(_year, _month) + + @report_yearmonth.setter + def report_yearmonth(self, ym): + self.cycle_coverage = cycle_coverage_yearmonth(YearMonth.from_any(ym)) + + +class BaseDailyReport(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY + class Meta: + abstract = True + + +class DailyStorageAddonUsageReportEs8(BaseDailyReport): usage_by_addon: list[UsageByStorageAddon] class Meta: @@ -343,9 +391,7 @@ class Meta: timeseries_recordtype_name = 'DailyStorageAddonUsageReport' -class DailyDownloadCountReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = DAILY - +class DailyDownloadCountReportEs8(BaseDailyReport): daily_file_downloads: int class Meta: @@ -353,8 +399,7 @@ class Meta: timeseries_recordtype_name = 'DailyDownloadCountReport' -class DailyInstitutionSummaryReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = DAILY +class DailyInstitutionSummaryReportEs8(BaseDailyReport): UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id',) institution_id: str @@ -370,8 +415,7 @@ class Meta: timeseries_recordtype_name = 'DailyInstitutionSummaryReport' -class DailyNewUserDomainReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = DAILY +class DailyNewUserDomainReportEs8(BaseDailyReport): UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'domain_name',) domain_name: str @@ -382,9 +426,7 @@ class Meta: timeseries_recordtype_name = 'DailyNewUserDomainReport' -class DailyNodeSummaryReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = DAILY - +class DailyNodeSummaryReportEs8(BaseDailyReport): nodes: NodeRunningTotals projects: NodeRunningTotals registered_nodes: RegistrationRunningTotals @@ -395,9 +437,7 @@ 
class Meta: timeseries_recordtype_name = 'DailyNodeSummaryReport' -class DailyOsfstorageFileCountReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = DAILY - +class DailyOsfstorageFileCountReportEs8(BaseDailyReport): files: FileRunningTotals class Meta: @@ -405,9 +445,7 @@ class Meta: timeseries_recordtype_name = 'DailyOsfstorageFileCountReport' -class DailyPreprintSummaryReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = DAILY - +class DailyPreprintSummaryReportEs8(BaseDailyReport): UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'provider_key',) provider_key: str preprint_count: int @@ -417,9 +455,7 @@ class Meta: timeseries_recordtype_name = 'DailyPreprintSummaryReport' -class DailyUserSummaryReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = DAILY - +class DailyUserSummaryReportEs8(BaseDailyReport): active: int deactivated: int merged: int @@ -432,9 +468,7 @@ class Meta: timeseries_recordtype_name = 'DailyUserSummaryReport' -class MonthlySpamSummaryReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = MONTHLY - +class MonthlySpamSummaryReportEs8(BaseMonthlyReport): node_confirmed_spam: int node_confirmed_ham: int node_flagged: int @@ -452,8 +486,7 @@ class Meta: timeseries_recordtype_name = 'MonthlySpamSummaryReport' -class MonthlyInstitutionalUserReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = MONTHLY +class MonthlyInstitutionalUserReportEs8(BaseMonthlyReport): UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', 'user_id',) institution_id: str @@ -479,8 +512,7 @@ class Meta: timeseries_recordtype_name = 'MonthlyInstitutionalUserReport' -class MonthlyInstitutionSummaryReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = MONTHLY +class MonthlyInstitutionSummaryReportEs8(BaseMonthlyReport): UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', ) institution_id: str @@ -500,8 +532,7 @@ class Meta: timeseries_recordtype_name = 'MonthlyInstitutionSummaryReport' -class MonthlyPublicItemUsageReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = MONTHLY 
+class MonthlyPublicItemUsageReportEs8(BaseMonthlyReport): UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'item_iri') # where noted, fields are meant to correspond to defined terms from COUNTER @@ -532,9 +563,7 @@ class Meta: timeseries_recordtype_name = 'MonthlyPublicItemUsageReport' -class MonthlyPrivateSpamMetricsReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = MONTHLY - +class MonthlyPrivateSpamMetricsReportEs8(BaseMonthlyReport): node_oopspam_flagged: int node_oopspam_hammed: int node_akismet_flagged: int diff --git a/osf/metrics/reporters/institution_summary_monthly.py b/osf/metrics/reporters/institution_summary_monthly.py index 88d8e1fb891..495dac6fdc6 100644 --- a/osf/metrics/reporters/institution_summary_monthly.py +++ b/osf/metrics/reporters/institution_summary_monthly.py @@ -6,7 +6,6 @@ from addons.osfstorage.models import OsfStorageFile from osf.metrics.reports import InstitutionMonthlySummaryReport from osf.metrics.es8_metrics import MonthlyInstitutionSummaryReportEs8 -from osf.metrics.utils import cycle_coverage_yearmonth from ._base import MonthlyReporter class InstitutionalSummaryMonthlyReporter(MonthlyReporter): @@ -35,7 +34,7 @@ def generate_report(self, institution): preprint_queryset = self.get_published_preprints(institution, self.yearmonth) reports = [] report_es8 = MonthlyInstitutionSummaryReportEs8( - cycle_coverage=cycle_coverage_yearmonth(self.yearmonth), + report_yearmonth=self.yearmonth, institution_id=institution._id, user_count=institution.get_institution_users().count(), private_project_count=self._get_count(node_queryset, 'osf.node', is_public=False), diff --git a/osf/metrics/reporters/institutional_users.py b/osf/metrics/reporters/institutional_users.py index a9fba3adfcb..a36b7130a83 100644 --- a/osf/metrics/reporters/institutional_users.py +++ b/osf/metrics/reporters/institutional_users.py @@ -7,7 +7,7 @@ from osf.models.spam import SpamStatus from addons.osfstorage.models import OsfStorageFile from osf.metrics.reports import 
InstitutionalUserReport -from osf.metrics.utils import YearMonth, cycle_coverage_yearmonth +from osf.metrics.utils import YearMonth from osf.metrics.es8_metrics import MonthlyInstitutionalUserReportEs8 from ._base import MonthlyReporter @@ -52,7 +52,7 @@ class _InstiUserReportHelper: def build_reports(self): _affiliation = self.user.get_institution_affiliation(self.institution._id) report_es8 = MonthlyInstitutionalUserReportEs8( - cycle_coverage=cycle_coverage_yearmonth(self.yearmonth), + report_yearmonth=self.yearmonth, institution_id=self.institution._id, user_id=self.user._id, user_name=self.user.fullname, diff --git a/osf/metrics/reporters/private_spam_metrics.py b/osf/metrics/reporters/private_spam_metrics.py index fde545247e6..08ea28f49d5 100644 --- a/osf/metrics/reporters/private_spam_metrics.py +++ b/osf/metrics/reporters/private_spam_metrics.py @@ -2,7 +2,6 @@ from osf.external.oopspam.client import OOPSpamClient from osf.external.askismet.client import AkismetClient from osf.metrics.es8_metrics import MonthlyPrivateSpamMetricsReportEs8 -from osf.metrics.utils import cycle_coverage_yearmonth from ._base import MonthlyReporter @@ -19,7 +18,7 @@ def report(self): reports = [] report_es8 = MonthlyPrivateSpamMetricsReportEs8( - cycle_coverage=cycle_coverage_yearmonth(self.yearmonth), + report_yearmonth=self.yearmonth, node_oopspam_flagged=oopspam_client.get_flagged_count(target_month, next_month, category='node'), node_oopspam_hammed=oopspam_client.get_hammed_count(target_month, next_month, category='node'), node_akismet_flagged=akismet_client.get_flagged_count(target_month, next_month, category='node'), diff --git a/osf/metrics/reporters/public_item_usage.py b/osf/metrics/reporters/public_item_usage.py index c32d318126c..d409a4b7331 100644 --- a/osf/metrics/reporters/public_item_usage.py +++ b/osf/metrics/reporters/public_item_usage.py @@ -2,29 +2,14 @@ import datetime import typing -import waffle +from elasticsearch8 import dsl as esdsl -from 
osf.metrics.es8_metrics import MonthlyPublicItemUsageReportEs8 - -if typing.TYPE_CHECKING: - import elasticsearch6_dsl as edsl - -import osf.features from osf.metadata.osf_gathering import OsfmapPartition -from osf.metrics.counted_usage import ( - CountedAuthUsage, - get_item_type, - get_provider_id, +from osf.metrics.es8_metrics import ( + MonthlyPublicItemUsageReportEs8, + OsfCountedUsageEvent, ) -from osf.metrics.preprint_metrics import ( - PreprintDownload, - PreprintView, -) -from osf.metrics.reports import PublicItemUsageReport from osf.metrics.utils import YearMonth, cycle_coverage_yearmonth -from osf import models as osfdb -from osf.models.base import osfid_iri -from website import settings as website_settings from ._base import MonthlyReporter @@ -43,60 +28,20 @@ class PublicItemUsageReporter(MonthlyReporter): includes projects, project components, registrations, registration components, and preprints ''' def iter_report_kwargs(self, continue_after: dict | None = None): - _after_osfid = continue_after['osfid'] if continue_after else None - for _osfid in _zip_sorted( - self._countedusage_osfids(_after_osfid), - self._preprintview_osfids(_after_osfid), - self._preprintdownload_osfids(_after_osfid), - ): - yield {'osfid': _osfid} + _after_item_iri = continue_after['item_iri'] if continue_after else None + for _item_iri in self._each_item_iri(_after_item_iri): + yield {'item_iri': _item_iri} def report(self, **report_kwargs): - _osfid = report_kwargs['osfid'] - # get usage metrics from several sources: - # - osf.metrics.counted_usage: - # - views and downloads for each item (using `CountedAuthUsage.item_guid`) - # - views for each item's components and files (using `CountedAuthUsage.surrounding_guids`) - # - osf.metrics.preprint_metrics: - # - preprint views and downloads - # - PageCounter? 
(no) + _item_iri = report_kwargs['item_iri'] try: - _guid = osfdb.Guid.load(_osfid) - if _guid is None or _guid.referent is None: - raise _SkipItem - _obj = _guid.referent - _report = self._init_report(_obj) - self._fill_report_counts(_report, _obj) - if not any(( - _report.view_count, - _report.view_session_count, - _report.download_count, - _report.download_session_count, - )): - raise _SkipItem - _report_es6 = PublicItemUsageReport( - item_osfid=_report.item_osfids[0], - item_type=list(_report.item_types), - provider_id=list(_report.provider_ids), - platform_iri=list(_report.platform_iris), - view_count=_report.view_count, - view_session_count=_report.view_session_count, - download_count=_report.download_count, - download_session_count=_report.download_session_count, - ) - return [_report, _report_es6] + return [self._build_report(_item_iri)] except _SkipItem: return [] def followup_task(self, report): _last_month = YearMonth.from_date(datetime.date.today()).prior() - if isinstance(report, MonthlyPublicItemUsageReportEs8): - _is_last_month = (report.cycle_coverage == cycle_coverage_yearmonth(_last_month)) - elif isinstance(report, PublicItemUsageReport): - return None # followup for only one of the two reports - else: - raise ValueError(report) - if _is_last_month: + if report.report_yearmonth == _last_month: from api.share.utils import task__update_share return task__update_share.signature( args=(report.item_osfids[0],), @@ -107,211 +52,121 @@ def followup_task(self, report): countdown=30, # give index time to settle ) - def _countedusage_osfids(self, after_osfid: str | None) -> typing.Iterator[str]: + def _each_item_iri(self, after_item_iri: str | None) -> typing.Iterator[str]: _search = self._base_usage_search() _search.aggs.bucket( - 'agg_osfid', + 'agg_item_iri', 'composite', - sources=[{'osfid': {'terms': {'field': 'item_guid'}}}], + sources=[{'item_iri': {'terms': {'field': 'item_iri'}}}], size=_CHUNK_SIZE, ) - return _iter_composite_bucket_keys(_search, 
'agg_osfid', 'osfid', after=after_osfid) + return _iter_composite_bucket_keys(_search, 'agg_item_iri', 'item_iri', after=after_item_iri) - def _preprintview_osfids(self, after_osfid: str | None) -> typing.Iterator[str]: - _search = ( - PreprintView.search() - .filter('range', timestamp={ - 'gte': self.yearmonth.month_start(), - 'lt': self.yearmonth.month_end(), - }) - .extra(size=0) # only aggregations, no hits - ) - _search.aggs.bucket( - 'agg_osfid', - 'composite', - sources=[{'osfid': {'terms': {'field': 'preprint_id'}}}], - size=_CHUNK_SIZE, + def _build_report(self, item_iri) -> MonthlyPublicItemUsageReportEs8: + # get usage metrics from OsfCountedUsageEvent: + # - views of the item and its components and files (matching `within_iris`) + # - downloads for each item (matching `item_iri`) + _search = self._build_usage_counts_search(item_iri) + _response = _search.execute() + _views_bucket = _response.aggregations.agg_by_label.buckets.views + _downloads_bucket = _response.aggregations.agg_by_label.buckets.downloads + _fields_agg = _response.aggregations.agg_for_terms + _report = MonthlyPublicItemUsageReportEs8( + report_yearmonth=self.yearmonth, + item_iri=item_iri, + item_osfids=_bucket_keys(_fields_agg.item_osfids.buckets), + database_iris=_bucket_keys(_fields_agg.database_iris.buckets), + platform_iris=_bucket_keys(_fields_agg.platform_iris.buckets), + provider_ids=_bucket_keys(_fields_agg.provider_ids.buckets), + item_types=_bucket_keys(_fields_agg.item_types.buckets), + view_count=_views_bucket.doc_count, + view_session_count=_views_bucket.agg_session_count.value, + download_count=_downloads_bucket.doc_count, + download_session_count=_downloads_bucket.agg_session_count.value, + # same as non-cumulative counts, unless there's a prior report (added below) + cumulative_view_count=_views_bucket.doc_count, + cumulative_view_session_count=_views_bucket.agg_session_count.value, + cumulative_download_count=_downloads_bucket.doc_count, + 
cumulative_download_session_count=_downloads_bucket.agg_session_count.value, ) - return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid) - - def _preprintdownload_osfids(self, after_osfid: str | None) -> typing.Iterator[str]: - _search = ( - PreprintDownload.search() + _prior = self._prior_usage_report(item_iri) + if _prior is not None: + _report.cumulative_view_count += _prior.cumulative_view_count + _report.cumulative_view_session_count += _prior.cumulative_view_session_count + _report.cumulative_download_count += _prior.cumulative_download_count + _report.cumulative_download_session_count += _prior.cumulative_download_session_count + return _report + + def _base_usage_search(self): + return ( + OsfCountedUsageEvent.search() + .filter('term', item_public=True) .filter('range', timestamp={ - 'gte': self.yearmonth.month_start(), 'lt': self.yearmonth.month_end(), + 'gte': self.yearmonth.month_start() }) .extra(size=0) # only aggregations, no hits ) - _search.aggs.bucket( - 'agg_osfid', - 'composite', - sources=[{'osfid': {'terms': {'field': 'preprint_id'}}}], - size=_CHUNK_SIZE, - ) - return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid) - - def _init_report(self, osf_obj) -> MonthlyPublicItemUsageReportEs8: - if not _is_item_public(osf_obj): - raise _SkipItem - return MonthlyPublicItemUsageReportEs8( - cycle_coverage=cycle_coverage_yearmonth(self.yearmonth), - item_iri=osfid_iri(osf_obj._id), - item_osfids=[osf_obj._id], - item_types=[get_item_type(osf_obj)], - provider_ids=[get_provider_id(osf_obj)], - platform_iris=[website_settings.DOMAIN], - # leave counts null; will be set if there's data - ) - - def _fill_report_counts(self, report, osf_obj): - if ( - isinstance(osf_obj, osfdb.Preprint) - and not waffle.switch_is_active(osf.features.COUNTEDUSAGE_UNIFIED_METRICS_2024) # type: ignore[attr-defined] - ): - # note: no session-count info in preprint metrics - report.view_count = 
PreprintView.get_count_for_preprint( - preprint=osf_obj, - after=self.yearmonth.month_start(), - before=self.yearmonth.month_end(), - ) - report.download_count = PreprintDownload.get_count_for_preprint( - preprint=osf_obj, - after=self.yearmonth.month_start(), - before=self.yearmonth.month_end(), - ) - report.cumulative_view_count = PreprintView.get_count_for_preprint( - preprint=osf_obj, - before=self.yearmonth.month_end(), - ) - report.cumulative_download_count = PreprintDownload.get_count_for_preprint( - preprint=osf_obj, - before=self.yearmonth.month_end(), - ) - else: - ( - report.view_count, - report.view_session_count, - ) = self._countedusage_view_counts(osf_obj, cumulative=False) - ( - report.download_count, - report.download_session_count, - ) = self._countedusage_download_counts(osf_obj, cumulative=False) - - ( - report.cumulative_view_count, - report.cumulative_view_session_count, - ) = self._countedusage_view_counts(osf_obj, cumulative=True) - - ( - report.cumulative_download_count, - report.cumulative_download_session_count, - ) = self._countedusage_download_counts(osf_obj, cumulative=True) - - def _base_usage_search(self, cumulative: bool = False): - timestamp_filter = { - 'lt': self.yearmonth.month_end(), - } - if not cumulative: - timestamp_filter['gte'] = self.yearmonth.month_start() - return ( - CountedAuthUsage.search() - .filter('term', item_public=True) - .filter('range', timestamp=timestamp_filter) - .extra(size=0) # only aggregations, no hits - ) - def _countedusage_view_counts(self, osf_obj, cumulative: bool = False) -> tuple[int, int]: - '''compute view_session_count separately to avoid double-counting - - (the same session may be represented in both the composite agg on `item_guid` - and that on `surrounding_guids`) + def _build_usage_counts_search(self, item_iri, cumulative: bool = False) -> tuple[int, int]: + '''get usage counts for the given item_iri ''' - _search = ( - self._base_usage_search(cumulative=cumulative) - .query( - 'bool', 
- filter=[ - {'term': {'action_labels': CountedAuthUsage.ActionLabel.VIEW.value}}, - ], - should=[ - {'term': {'item_guid': osf_obj._id}}, - {'term': {'surrounding_guids': osf_obj._id}}, - ], - minimum_should_match=1, - ) - ) - _search.aggs.metric( + _search = self._base_usage_search().filter('term', within_iris=item_iri) + + # aggregation for counts by action label (views, downloads) + _agg_by_label = esdsl.A('filters', filters={ + # bucket for views (including items within) + 'views': {'term': {'action_labels': OsfCountedUsageEvent.ActionLabel.VIEW.value}}, + # bucket for downloads (excluding items within) + 'downloads': { + 'bool': { + 'filter': [ + {'term': {'action_labels': OsfCountedUsageEvent.ActionLabel.DOWNLOAD.value}}, + {'term': {'item_iri': item_iri}}, + ], + }, + }, + }) + # session count for each label bucket + _agg_by_label.metric( 'agg_session_count', 'cardinality', - field='session_id', + field='sessionhour_id', precision_threshold=_MAX_CARDINALITY_PRECISION, ) - _response = _search.execute() - _view_count = _response.hits.total - _view_session_count = ( - _response.aggregations.agg_session_count.value - if 'agg_session_count' in _response.aggregations - else 0 - ) - return (_view_count, _view_session_count) + _search.aggs.bucket('agg_by_label', _agg_by_label) - def _countedusage_download_counts(self, osf_obj, cumulative: bool = False) -> tuple[int, int]: - '''aggregate downloads on each osfid (not including components/files)''' - _search = ( - self._base_usage_search(cumulative=cumulative) - .filter('term', item_guid=osf_obj._id) - .filter('term', action_labels=CountedAuthUsage.ActionLabel.DOWNLOAD.value) - ) - # agg: get download session count - _search.aggs.metric( - 'agg_session_count', - 'cardinality', - field='session_id', - precision_threshold=_MAX_CARDINALITY_PRECISION, - ) - _response = _search.execute() - _download_count = _response.hits.total - _download_session_count = ( - _response.aggregations.agg_session_count.value - if 
'agg_session_count' in _response.aggregations - else 0 - ) - return (_download_count, _download_session_count) + # aggregation for getting terms used on usage events directly on the item + # (excluding items within) -- usually one value per field per item, but could be more + _agg_for_terms = esdsl.A('filter', term={'item_iri': item_iri}) + _agg_for_terms.bucket('item_osfids', esdsl.A('terms', field='item_osfid')) + _agg_for_terms.bucket('item_types', esdsl.A('terms', field='item_type')) + _agg_for_terms.bucket('platform_iris', esdsl.A('terms', field='platform_iri')) + _agg_for_terms.bucket('database_iris', esdsl.A('terms', field='database_iri')) + _agg_for_terms.bucket('provider_ids', esdsl.A('terms', field='provider_id')) + _search.aggs.bucket('agg_for_terms', _agg_for_terms) + return _search -def _is_item_public(osfid_referent) -> bool: - if isinstance(osfid_referent, osfdb.Preprint): - return bool(osfid_referent.verified_publishable) # quacks like Preprint - return getattr(osfid_referent, 'is_public', False) # quacks like AbstractNode + def _prior_usage_report(self, item_iri): + _search = ( + MonthlyPublicItemUsageReportEs8.search() + .filter('term', item_iri=item_iri) + .filter('range', cycle_coverage={ + 'lt': cycle_coverage_yearmonth(self.yearmonth), + }) + .sort('-cycle_coverage') # most recent first + ) + _response = _search[0].execute() + return _response[0] if _response else None -def _zip_sorted( - *iterators: typing.Iterator[str], -) -> typing.Iterator[str]: - '''loop thru multiple iterators on sorted (ascending) sequences of strings - ''' - _nexts = { # holds the next value from each iterator, or None - _i: next(_iter, None) - for _i, _iter in enumerate(iterators) - } - while True: - _nonnull_nexts = [ - _next - for _next in _nexts.values() - if _next is not None - ] - if not _nonnull_nexts: - return # all done - _value = min(_nonnull_nexts) - yield _value - for _i, _iter in enumerate(iterators): - if _nexts[_i] == _value: - _nexts[_i] = next(_iter, 
None) +def _bucket_keys(buckets): + return [_bucket['key'] for _bucket in buckets] def _iter_composite_bucket_keys( - search: edsl.Search, + search: esdsl.Search, composite_agg_name: str, composite_source_name: str, after: str | None = None, diff --git a/osf/metrics/reporters/spam_count.py b/osf/metrics/reporters/spam_count.py index 2fbac671ad1..4e35958d257 100644 --- a/osf/metrics/reporters/spam_count.py +++ b/osf/metrics/reporters/spam_count.py @@ -4,7 +4,6 @@ from osf.models import PreprintLog, NodeLog from osf.models.spam import SpamStatus from osf.metrics.es8_metrics import MonthlySpamSummaryReportEs8 -from osf.metrics.utils import cycle_coverage_yearmonth from ._base import MonthlyReporter class SpamCountReporter(MonthlyReporter): @@ -15,7 +14,7 @@ def report(self, **report_kwargs): next_month = self.yearmonth.month_end() reports = [] report_es8 = MonthlySpamSummaryReportEs8( - cycle_coverage=cycle_coverage_yearmonth(self.yearmonth), + report_yearmonth=self.yearmonth, node_confirmed_spam=NodeLog.objects.filter( action=NodeLog.CONFIRM_SPAM, created__gt=target_month, diff --git a/osf_tests/management_commands/test_monthly_reporters_go.py b/osf_tests/management_commands/test_monthly_reporters_go.py index 505e7adf4bd..c9be02b7fe3 100644 --- a/osf_tests/management_commands/test_monthly_reporters_go.py +++ b/osf_tests/management_commands/test_monthly_reporters_go.py @@ -5,18 +5,16 @@ from elasticsearch_metrics.tests.util import djelme_test_backends from framework.celery_tasks import app as celery_app -from osf.metrics import reports as es6_reports from osf.metrics.es8_metrics import ( MonthlyInstitutionSummaryReportEs8, MonthlyInstitutionalUserReportEs8, MonthlyPrivateSpamMetricsReportEs8, MonthlyPublicItemUsageReportEs8, MonthlySpamSummaryReportEs8, + OsfCountedUsageEvent, ) -from osf.metrics.counted_usage import CountedAuthUsage from osf.metrics.utils import YearMonth from osf_tests import factories -from website import settings as website_settings class 
TestMonthlyReportersGo(TestCase): @@ -33,21 +31,12 @@ def setUp(self): _user.add_or_update_affiliated_institution(_inst) # set up for public item usage report _reg = factories.RegistrationFactory(is_public=True) - CountedAuthUsage.record( - platform_iri=website_settings.DOMAIN, - item_guid=_reg._id, - session_id='blarg', - user_is_authenticated=True, + OsfCountedUsageEvent.record( + item_osfid=_reg._id, action_labels=['view', 'web'], + user_id=_user._id, ) - CountedAuthUsage._get_connection().indices.refresh(CountedAuthUsage._template_pattern) - # TODO when switching to use es8 data - # OsfCountedUsageEvent.record( - # item_osfid=_preprint._id, - # action_labels=['view', 'web'], - # user_id=_user._id, - # ) - # OsfCountedUsageEvent.refresh() + OsfCountedUsageEvent.refresh() def test_for_smoke(self): self._assert_count(MonthlyInstitutionSummaryReportEs8, 0) @@ -55,22 +44,12 @@ def test_for_smoke(self): self._assert_count(MonthlyPrivateSpamMetricsReportEs8, 0) self._assert_count(MonthlyPublicItemUsageReportEs8, 0) self._assert_count(MonthlySpamSummaryReportEs8, 0) - self._assert_count(es6_reports.SpamSummaryReport, 0) - self._assert_count(es6_reports.InstitutionalUserReport, 0) - self._assert_count(es6_reports.InstitutionMonthlySummaryReport, 0) - self._assert_count(es6_reports.PublicItemUsageReport, 0) - self._assert_count(es6_reports.PrivateSpamMetricsReport, 0) call_command('monthly_reporters_go', yearmonth=str(self._report_yearmonth)) self._assert_count(MonthlyInstitutionSummaryReportEs8, 1) self._assert_count(MonthlyInstitutionalUserReportEs8, 1) self._assert_count(MonthlyPrivateSpamMetricsReportEs8, 1) self._assert_count(MonthlyPublicItemUsageReportEs8, 1) self._assert_count(MonthlySpamSummaryReportEs8, 1) - self._assert_count(es6_reports.SpamSummaryReport, 1) - self._assert_count(es6_reports.InstitutionalUserReport, 1) - self._assert_count(es6_reports.InstitutionMonthlySummaryReport, 1) - self._assert_count(es6_reports.PublicItemUsageReport, 1) - 
self._assert_count(es6_reports.PrivateSpamMetricsReport, 1) def _assert_count(self, recordtype, expected_count): if hasattr(recordtype, 'refresh'): diff --git a/osf_tests/metrics/reporters/_testutils.py b/osf_tests/metrics/reporters/_testutils.py index 3275b0f1651..ef504c06a18 100644 --- a/osf_tests/metrics/reporters/_testutils.py +++ b/osf_tests/metrics/reporters/_testutils.py @@ -1,8 +1,9 @@ +from elasticsearch_metrics.imps.elastic8 import CyclicRecord + from osf.metrics.reporters._base import MonthlyReporter -from osf.metrics.reports import MonthlyReport -def list_monthly_reports(reporter: MonthlyReporter, *, flat=False) -> list[MonthlyReport]: +def list_monthly_reports(reporter: MonthlyReporter) -> list[CyclicRecord]: _each_reports_list = ( reporter.report(**_kwargs) for _kwargs in reporter.iter_report_kwargs() @@ -11,5 +12,4 @@ def list_monthly_reports(reporter: MonthlyReporter, *, flat=False) -> list[Month _report for _reports_list in _each_reports_list for _report in _reports_list - if isinstance(_report, MonthlyReport) # TODO: update tests with es8 ] diff --git a/osf_tests/metrics/reporters/test_institutional_summary_reporter.py b/osf_tests/metrics/reporters/test_institutional_summary_reporter.py index 02c24d86f3c..5075b404d2b 100644 --- a/osf_tests/metrics/reporters/test_institutional_summary_reporter.py +++ b/osf_tests/metrics/reporters/test_institutional_summary_reporter.py @@ -80,19 +80,19 @@ def _create_active_user(cls, institution, date_confirmed): def test_report_generation(self): reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth) reports = list_monthly_reports(reporter) - self.assertEqual(len(reports), 1) - report = reports[0] - self.assertEqual(report.institution_id, self._institution._id) - self.assertEqual(report.user_count, 2) # _logged_in_user and _active_user - self.assertEqual(report.public_project_count, 1) - self.assertEqual(report.private_project_count, 1) - self.assertEqual(report.public_registration_count, 1) - 
self.assertEqual(report.embargoed_registration_count, 1) - self.assertEqual(report.published_preprint_count, 1) - self.assertEqual(report.storage_byte_count, 1337) # test value for one file - self.assertEqual(report.public_file_count, 1) - self.assertEqual(report.monthly_logged_in_user_count, 1) - self.assertEqual(report.monthly_active_user_count, 1) + self.assertEqual(len(reports), 2) + for report in reports: + self.assertEqual(report.institution_id, self._institution._id) + self.assertEqual(report.user_count, 2) # _logged_in_user and _active_user + self.assertEqual(report.public_project_count, 1) + self.assertEqual(report.private_project_count, 1) + self.assertEqual(report.public_registration_count, 1) + self.assertEqual(report.embargoed_registration_count, 1) + self.assertEqual(report.published_preprint_count, 1) + self.assertEqual(report.storage_byte_count, 1337) # test value for one file + self.assertEqual(report.public_file_count, 1) + self.assertEqual(report.monthly_logged_in_user_count, 1) + self.assertEqual(report.monthly_active_user_count, 1) def test_report_generation_multiple_institutions(self): institution2 = InstitutionFactory() @@ -115,25 +115,27 @@ def test_report_generation_multiple_institutions(self): # Run the reporter for the current month (February 2018) reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth) reports = list_monthly_reports(reporter) - self.assertEqual(len(reports), 3) # Reports for self._institution, institution2, institution3 + self.assertEqual(len(reports), 6) # Reports for self._institution, institution2, institution3 # Extract reports by institution - report_institution = next(r for r in reports if r.institution_id == self._institution._id) - report_institution2 = next(r for r in reports if r.institution_id == institution2._id) + _reports1 = [r for r in reports if r.institution_id == self._institution._id] + _reports2 = [r for r in reports if r.institution_id == institution2._id] # Validate report for 
self._institution - self.assertEqual(report_institution.public_project_count, 1) - self.assertEqual(report_institution.private_project_count, 1) - self.assertEqual(report_institution.user_count, 2) - self.assertEqual(report_institution.monthly_active_user_count, 1) - self.assertEqual(report_institution.monthly_logged_in_user_count, 1) + for _report in _reports1: + self.assertEqual(_report.public_project_count, 1) + self.assertEqual(_report.private_project_count, 1) + self.assertEqual(_report.user_count, 2) + self.assertEqual(_report.monthly_active_user_count, 1) + self.assertEqual(_report.monthly_logged_in_user_count, 1) # Validate report for institution2 - self.assertEqual(report_institution2.public_project_count, 1) - self.assertEqual(report_institution2.private_project_count, 0) - self.assertEqual(report_institution2.user_count, 1) - self.assertEqual(report_institution2.monthly_active_user_count, 1) - self.assertEqual(report_institution2.monthly_logged_in_user_count, 0) # No logged-in users + for _report in _reports2: + self.assertEqual(_report.public_project_count, 1) + self.assertEqual(_report.private_project_count, 0) + self.assertEqual(_report.user_count, 1) + self.assertEqual(_report.monthly_active_user_count, 1) + self.assertEqual(_report.monthly_logged_in_user_count, 0) # No logged-in users class TestSummaryMonthlyReporterBenchmarker(TestCase): @@ -264,7 +266,6 @@ def test_high_counts_multiple_institutions(self): reporter_start_time = time.time() reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth) reports = list_monthly_reports(reporter) - assert len(reports) == additional_institution_count + 1 if enable_benchmarking: reporter_end_time = time.time() @@ -272,7 +273,7 @@ def test_high_counts_multiple_institutions(self): total_end_time = time.time() self.logger.info(f"Total test execution time: {total_end_time - total_start_time:.2f} seconds") - self.assertEqual(len(reports), additional_institution_count + 1) + self.assertEqual(len(reports), 2 * 
(additional_institution_count + 1)) # Validate counts for each institution expected_count = users_per_institution * objects_per_user diff --git a/osf_tests/metrics/reporters/test_institutional_users_reporter.py b/osf_tests/metrics/reporters/test_institutional_users_reporter.py index e399d848396..2037af11583 100644 --- a/osf_tests/metrics/reporters/test_institutional_users_reporter.py +++ b/osf_tests/metrics/reporters/test_institutional_users_reporter.py @@ -8,7 +8,6 @@ from api_tests.utils import create_test_file from osf import models as osfdb from osf.management.commands.populate_notification_types import populate_notification_types -from osf.metrics.reports import InstitutionalUserReport from osf.metrics.reporters import InstitutionalUsersReporter from osf.metrics.utils import YearMonth from osf_tests.factories import ( @@ -48,7 +47,7 @@ def setUpTestData(cls): ) cls._user_setup_with_stuff.fill_uncounted_objects() - def _assert_report_matches_setup(self, report: InstitutionalUserReport, setup: _InstiUserSetup): + def _assert_report_matches_setup(self, report, setup: _InstiUserSetup): self.assertEqual(report.institution_id, setup.institution._id) # user info: self.assertEqual(report.user_id, setup.user._id) @@ -76,19 +75,19 @@ def test_no_users(self): def test_one_user_with_nothing(self): self._user_setup_with_nothing.affiliate_user() _reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) - self.assertEqual(len(_reports), 1) + self.assertEqual(len(_reports), 2) self._assert_report_matches_setup(_reports[0], self._user_setup_with_nothing) def test_one_user_with_ones(self): self._user_setup_with_ones.affiliate_user() _reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) - self.assertEqual(len(_reports), 1) + self.assertEqual(len(_reports), 2) self._assert_report_matches_setup(_reports[0], self._user_setup_with_ones) def test_one_user_with_stuff_and_no_files(self): self._user_setup_with_stuff.affiliate_user() _reports = 
list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) - self.assertEqual(len(_reports), 1) + self.assertEqual(len(_reports), 2) self._assert_report_matches_setup(_reports[0], self._user_setup_with_stuff) self.assertEqual(_reports[0].public_file_count, 2) # preprint 2 files self.assertEqual(_reports[0].storage_byte_count, 2674) # preprint bytes @@ -99,10 +98,12 @@ def test_one_user_with_stuff_and_a_file(self): _project = _user.nodes.first() with _patch_now(self._now): create_test_file(target=_project, user=_user, size=37) - (_report,) = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) - self._assert_report_matches_setup(_report, self._user_setup_with_stuff) - self.assertEqual(_report.public_file_count, 3) # 2 preprint files - self.assertEqual(_report.storage_byte_count, 2711) # 2 preprint files + _reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) + self.assertEqual(len(_reports), 2) + for _report in _reports: + self._assert_report_matches_setup(_report, self._user_setup_with_stuff) + self.assertEqual(_report.public_file_count, 3) # 2 preprint files + self.assertEqual(_report.storage_byte_count, 2711) # 2 preprint files def test_one_user_with_stuff_and_multiple_files(self): self._user_setup_with_stuff.affiliate_user() @@ -116,10 +117,12 @@ def test_one_user_with_stuff_and_multiple_files(self): create_test_file(target=_component, user=_user, size=53, filename='bla') create_test_file(target=_component, user=_user, size=51, filename='blar') create_test_file(target=_component, user=_user, size=47, filename='blarg') - (_report,) = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) - self._assert_report_matches_setup(_report, self._user_setup_with_stuff) - self.assertEqual(_report.public_file_count, 7) # 2 preprint files - self.assertEqual(_report.storage_byte_count, 2935) # 2 preprint files + 37 + 73 + 53 + 51 + 47 + _reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) + 
self.assertEqual(len(_reports), 2) + for _report in _reports: + self._assert_report_matches_setup(_report, self._user_setup_with_stuff) + self.assertEqual(_report.public_file_count, 7) # 2 preprint files + self.assertEqual(_report.storage_byte_count, 2935) # 2 preprint files + 37 + 73 + 53 + 51 + 47 def test_several_users(self): _setups = [ @@ -134,7 +137,7 @@ def test_several_users(self): for _setup in _setups } _reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) - self.assertEqual(len(_reports), len(_setup_by_userid)) + self.assertEqual(len(_reports), 2 * len(_setup_by_userid)) for _actual_report in _reports: _setup = _setup_by_userid[_actual_report.user_id] self._assert_report_matches_setup(_actual_report, _setup) diff --git a/osf_tests/metrics/reporters/test_public_item_usage_reporter.py b/osf_tests/metrics/reporters/test_public_item_usage_reporter.py index 69bd266285a..6b1fe7e90ec 100644 --- a/osf_tests/metrics/reporters/test_public_item_usage_reporter.py +++ b/osf_tests/metrics/reporters/test_public_item_usage_reporter.py @@ -1,246 +1,295 @@ from datetime import datetime, timedelta +from functools import cached_property from operator import attrgetter from unittest import mock -import pytest +from django.test import TestCase +from elasticsearch_metrics.tests.util import RealElasticTestCase -from osf.metrics.counted_usage import CountedAuthUsage -from osf.metrics.preprint_metrics import ( - PreprintDownload, - PreprintView, +from osf.metadata.rdfutils import OSF +from osf.metrics.es8_metrics import ( + OsfCountedUsageEvent, + MonthlyPublicItemUsageReportEs8, ) from osf.metrics.reporters.public_item_usage import PublicItemUsageReporter -from osf.metrics.reports import PublicItemUsageReport from osf.metrics.utils import YearMonth -from osf import models as osfdb from osf_tests import factories from ._testutils import list_monthly_reports -@pytest.mark.es_metrics -@pytest.mark.django_db -class TestPublicItemUsageReporter: - 
@pytest.fixture(autouse=True) - def _patch_settings(self): - with mock.patch('website.settings.DOMAIN', 'http://osf.example'): - yield +class TestPublicItemUsageReporter(RealElasticTestCase, TestCase): + def setUp(self): + super().setUp() + self.enterContext(mock.patch('website.settings.DOMAIN', 'http://osf.example/')) - @pytest.fixture + @cached_property def item0(self): _item0 = factories.PreprintFactory(is_public=True, set_guid='item0') return _item0 - @pytest.fixture + @cached_property def item1(self): _item1 = factories.ProjectFactory(is_public=True) _item1._id = 'item1' return _item1 - @pytest.fixture - def item2(self, item1): - _item2 = factories.ProjectFactory(is_public=True, parent=item1) + @cached_property + def item2(self): + _item2 = factories.ProjectFactory(is_public=True, parent=self.item1) _item2._id = 'item2' return _item2 - @pytest.fixture + @cached_property def ym_empty(self) -> YearMonth: return YearMonth(2012, 7) - @pytest.fixture + @cached_property def ym_sparse(self) -> YearMonth: return YearMonth(2017, 7) - @pytest.fixture + @cached_property def ym_busy(self) -> YearMonth: return YearMonth(2023, 7) - @pytest.fixture - def sparse_month_usage(self, ym_sparse, item0, item1, item2): + def _setup_sparse_month_usage(self): # "sparse" month: # item0: 3 views, 0 downloads, 2 sessions # item1: 1 views, 1 download, 1 session (plus 1 view from child item2) # item2: 1 views, 0 downloads, 1 session - _month_start = ym_sparse.month_start() + _month_start = self.ym_sparse.month_start() _save_usage( - item0, + self.item0, timestamp=_month_start, - session_id='sesh0', + sessionhour_id='sesh0', action_labels=['view'], ) _save_usage( - item0, + self.item0, timestamp=_month_start + timedelta(minutes=2), - session_id='sesh0', + sessionhour_id='sesh0', action_labels=['view'], ) _save_usage( - item1, + self.item1, timestamp=_month_start + timedelta(minutes=3), - session_id='sesh0', + sessionhour_id='sesh0', action_labels=['download'], ) _save_usage( - item0, + 
self.item0, timestamp=_month_start + timedelta(days=17), - session_id='sesh1', + sessionhour_id='sesh1', action_labels=['view'], ) _save_usage( - item1, + self.item1, timestamp=_month_start + timedelta(days=17, minutes=3), - session_id='sesh1', + sessionhour_id='sesh1', action_labels=['view'], ) _save_usage( - item2, + self.item2, timestamp=_month_start + timedelta(days=17, minutes=5), - session_id='sesh1', + sessionhour_id='sesh1', action_labels=['view'], ) _save_usage( - item2, + self.item2, timestamp=_month_start + timedelta(days=17, minutes=11), - session_id='sesh1', + sessionhour_id='sesh1', action_labels=['download'], ) - @pytest.fixture - def busy_month_item0(self, ym_busy, item0): + def _setup_busy_month_item0(self): # item0: 4 sessions, 4*7 views, 4*5 downloads - _month_start = ym_busy.month_start() + _month_start = self.ym_busy.month_start() for _sesh in range(0, 4): _sesh_start = _month_start + timedelta(days=_sesh) for _minute in range(0, 7): _save_usage( - item0, + self.item0, timestamp=_sesh_start + timedelta(minutes=_minute), - session_id=f'sesh0{_sesh}', + sessionhour_id=f'sesh0{_sesh}', action_labels=['view'], ) for _minute in range(10, 15): _save_usage( - item0, + self.item0, timestamp=_sesh_start + timedelta(minutes=_minute), - session_id=f'sesh0{_sesh}', + sessionhour_id=f'sesh0{_sesh}', action_labels=['download'], ) + # plus prior report with cumulative counts: + # 4 views, 3 view sessions, 2 downloads, 1 download session + MonthlyPublicItemUsageReportEs8.record( + report_yearmonth=self.ym_busy.prior(), + item_iri='http://osf.example/item0_v1', + item_osfids=['item0_v1'], + item_types=[OSF.Preprint], + platform_iris=['http://osf.example/'], + database_iris=[self.item0.provider.get_semantic_iri()], + provider_ids=[self.item0.provider._id], + view_count=1, + view_session_count=1, + cumulative_view_count=4, + cumulative_view_session_count=3, + download_count=2, + download_session_count=1, + cumulative_download_count=2, + 
cumulative_download_session_count=1, + ) - @pytest.fixture - def busy_month_item1(self, ym_busy, item1): + def _setup_busy_month_item1(self): # item1: 10 sessions, 6*9 views, 5*7 downloads # (plus 11 views in 11 sessions from child item2) - _month_start = ym_busy.month_start() + _month_start = self.ym_busy.month_start() for _sesh in range(0, 6): _sesh_start = _month_start + timedelta(days=_sesh) for _minute in range(0, 9): _save_usage( - item1, + self.item1, timestamp=_sesh_start + timedelta(minutes=_minute), - session_id=f'sesh1{_sesh}', + sessionhour_id=f'sesh1{_sesh}', action_labels=['view'], ) for _sesh in range(5, 10): _sesh_start = _month_start + timedelta(days=_sesh) for _minute in range(10, 17): _save_usage( - item1, + self.item1, timestamp=_sesh_start + timedelta(minutes=_minute), - session_id=f'sesh1{_sesh}', + sessionhour_id=f'sesh1{_sesh}', action_labels=['download'], ) - @pytest.fixture - def busy_month_item2(self, ym_busy, item2): + def _setup_busy_month_item2(self): # item2: 11 sessions, 11 views, 11 downloads (child of item1) - _month_start = ym_busy.month_start() + _month_start = self.ym_busy.month_start() for _sesh in range(1, 12): _save_usage( - item2, + self.item2, timestamp=_month_start + timedelta(days=_sesh), - session_id=f'sesh2{_sesh}', + sessionhour_id=f'sesh2{_sesh}', action_labels=['view'], ) _save_usage( - item2, + self.item2, timestamp=_month_start + timedelta(days=_sesh, hours=_sesh), - session_id=f'sesh2{_sesh}', + sessionhour_id=f'sesh2{_sesh}', action_labels=['download'], ) - def test_no_data(self, ym_empty): - _reporter = PublicItemUsageReporter(ym_empty) + def test_no_data(self): + _reporter = PublicItemUsageReporter(self.ym_empty) _empty = list_monthly_reports(_reporter) assert _empty == [] - def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_month_item0, busy_month_item1, busy_month_item2, item0): - _empty = list_monthly_reports(PublicItemUsageReporter(ym_empty)) - _sparse = 
list_monthly_reports(PublicItemUsageReporter(ym_sparse)) - _busy = list_monthly_reports(PublicItemUsageReporter(ym_busy)) + def test_reporter(self): + self._setup_sparse_month_usage() + self._setup_busy_month_item0() + self._setup_busy_month_item1() + self._setup_busy_month_item2() + OsfCountedUsageEvent.refresh() + + _empty = list_monthly_reports(PublicItemUsageReporter(self.ym_empty)) + _sparse = list_monthly_reports(PublicItemUsageReporter(self.ym_sparse)) + _busy = list_monthly_reports(PublicItemUsageReporter(self.ym_busy)) # empty month: assert _empty == [] # sparse month: assert len(_sparse) == 3 - _sparse_item0, _sparse_item1, _sparse_item2 = sorted(_sparse, key=attrgetter('item_osfid')) + _sparse_item0, _sparse_item1, _sparse_item2 = sorted(_sparse, key=attrgetter('item_iri')) # sparse-month item0 - assert isinstance(_sparse_item0, PublicItemUsageReport) - assert _sparse_item0.item_osfid == 'item0_v1' - assert _sparse_item0.provider_id == [item0.provider._id] - assert _sparse_item0.platform_iri == ['http://osf.example'] + assert isinstance(_sparse_item0, MonthlyPublicItemUsageReportEs8) + assert _sparse_item0.item_iri == 'http://osf.example/item0_v1' + assert _sparse_item0.item_osfids == ['item0_v1'] + assert _sparse_item0.provider_ids == [self.item0.provider._id] + assert _sparse_item0.platform_iris == ['http://osf.example'] assert _sparse_item0.view_count == 3 - assert _sparse_item0.view_session_count is None # no session count for preprints + assert _sparse_item0.view_session_count == 2 assert _sparse_item0.download_count == 0 - assert _sparse_item0.download_session_count is None # no session count for preprints + assert _sparse_item0.download_session_count == 0 + assert _sparse_item0.cumulative_view_count == 3 + assert _sparse_item0.cumulative_view_session_count == 2 + assert _sparse_item0.cumulative_download_count == 0 + assert _sparse_item0.cumulative_download_session_count == 0 # sparse-month item1 - assert isinstance(_sparse_item1, 
PublicItemUsageReport) - assert _sparse_item1.item_osfid == 'item1' - assert _sparse_item1.provider_id == ['osf'] - assert _sparse_item1.platform_iri == ['http://osf.example'] + assert isinstance(_sparse_item1, MonthlyPublicItemUsageReportEs8) + assert _sparse_item1.item_iri == 'http://osf.example/item1' + assert _sparse_item1.item_osfids == ['item1'] + assert _sparse_item1.provider_ids == ['osf'] + assert _sparse_item1.platform_iris == ['http://osf.example'] assert _sparse_item1.view_count == 2 # including item2 assert _sparse_item1.view_session_count == 1 # including item2 assert _sparse_item1.download_count == 1 # NOT including item2 assert _sparse_item1.download_session_count == 1 # NOT including item2 + assert _sparse_item1.cumulative_view_count == 2 + assert _sparse_item1.cumulative_view_session_count == 1 + assert _sparse_item1.cumulative_download_count == 1 + assert _sparse_item1.cumulative_download_session_count == 1 # sparse-month item2 - assert isinstance(_sparse_item1, PublicItemUsageReport) - assert _sparse_item2.item_osfid == 'item2' - assert _sparse_item2.provider_id == ['osf'] - assert _sparse_item2.platform_iri == ['http://osf.example'] + assert isinstance(_sparse_item1, MonthlyPublicItemUsageReportEs8) + assert _sparse_item2.item_iri == 'http://osf.example/item2' + assert _sparse_item2.item_osfids == ['item2'] + assert _sparse_item2.provider_ids == ['osf'] + assert _sparse_item2.platform_iris == ['http://osf.example'] assert _sparse_item2.view_count == 1 assert _sparse_item2.view_session_count == 1 assert _sparse_item2.download_count == 1 assert _sparse_item2.download_session_count == 1 + assert _sparse_item2.cumulative_view_count == 1 + assert _sparse_item2.cumulative_view_session_count == 1 + assert _sparse_item2.cumulative_download_count == 1 + assert _sparse_item2.cumulative_download_session_count == 1 # busy month: assert len(_busy) == 3 - _busy_item0, _busy_item1, _busy_item2 = sorted(_busy, key=attrgetter('item_osfid')) - # busy-month item0 
- assert isinstance(_busy_item0, PublicItemUsageReport) - assert _busy_item0.item_osfid == 'item0_v1' - assert _busy_item0.provider_id == [item0.provider._id] - assert _busy_item0.platform_iri == ['http://osf.example'] + _busy_item0, _busy_item1, _busy_item2 = sorted(_busy, key=attrgetter('item_iri')) + # busy-month item0 (plus prior-month report) + assert isinstance(_busy_item0, MonthlyPublicItemUsageReportEs8) + assert _busy_item0.item_iri == 'http://osf.example/item0_v1' + assert _busy_item0.item_osfids == ['item0_v1'] + assert _busy_item0.provider_ids == [self.item0.provider._id] + assert _busy_item0.platform_iris == ['http://osf.example'] assert _busy_item0.view_count == 4 * 7 - assert _busy_item0.view_session_count is None # no session count for preprints + assert _busy_item0.view_session_count == 4 assert _busy_item0.download_count == 4 * 5 - assert _busy_item0.download_session_count is None # no session count for preprints + assert _busy_item0.download_session_count == 4 + # plus values from prior report: + assert _busy_item0.cumulative_view_count == (4 * 7) + 4 + assert _busy_item0.cumulative_view_session_count == 4 + 3 + assert _busy_item0.cumulative_download_count == (4 * 5) + 2 + assert _busy_item0.cumulative_download_session_count == 4 + 1 # busy-month item1 - assert isinstance(_busy_item1, PublicItemUsageReport) - assert _busy_item1.item_osfid == 'item1' - assert _busy_item1.provider_id == ['osf'] - assert _busy_item1.platform_iri == ['http://osf.example'] + assert isinstance(_busy_item1, MonthlyPublicItemUsageReportEs8) + assert _busy_item1.item_iri == 'http://osf.example/item1' + assert _busy_item1.item_osfids == ['item1'] + assert _busy_item1.provider_ids == ['osf'] + assert _busy_item1.platform_iris == ['http://osf.example'] assert _busy_item1.view_count == 6 * 9 + 11 assert _busy_item1.view_session_count == 6 + 11 assert _busy_item1.download_count == 5 * 7 assert _busy_item1.download_session_count == 5 + assert _busy_item1.cumulative_view_count 
== 6 * 9 + 11 + assert _busy_item1.cumulative_view_session_count == 6 + 11 + assert _busy_item1.cumulative_download_count == 5 * 7 + assert _busy_item1.cumulative_download_session_count == 5 # busy-month item2 - assert isinstance(_busy_item2, PublicItemUsageReport) - assert _busy_item2.item_osfid == 'item2' - assert _busy_item2.provider_id == ['osf'] - assert _busy_item2.platform_iri == ['http://osf.example'] + assert isinstance(_busy_item2, MonthlyPublicItemUsageReportEs8) + assert _busy_item2.item_iri == 'http://osf.example/item2' + assert _busy_item2.item_osfids == ['item2'] + assert _busy_item2.provider_ids == ['osf'] + assert _busy_item2.platform_iris == ['http://osf.example'] assert _busy_item2.view_count == 11 assert _busy_item2.view_session_count == 11 assert _busy_item2.download_count == 11 assert _busy_item2.download_session_count == 11 + assert _busy_item2.cumulative_view_count == 11 + assert _busy_item2.cumulative_view_session_count == 11 + assert _busy_item2.cumulative_download_count == 11 + assert _busy_item2.cumulative_download_session_count == 11 def _save_usage( @@ -252,32 +301,9 @@ def _save_usage( ): _countedusage_kwargs = { 'timestamp': timestamp, - 'item_guid': item._id, + 'item_osfid': item._id, 'action_labels': action_labels, 'platform_iri': 'http://osf.example', **kwargs, } - CountedAuthUsage(**_countedusage_kwargs).save(refresh=True) - if isinstance(item, osfdb.Preprint): - if 'view' in action_labels: - _save_preprint_view(item, timestamp) - if 'download' in action_labels: - _save_preprint_download(item, timestamp) - - -def _save_preprint_view(preprint, timestamp): - PreprintView( - timestamp=timestamp, - count=1, - preprint_id=preprint._id, - provider_id=preprint.provider._id, - ).save(refresh=True) - - -def _save_preprint_download(preprint, timestamp): - PreprintDownload( - timestamp=timestamp, - count=1, - preprint_id=preprint._id, - provider_id=preprint.provider._id, - ).save(refresh=True) + 
OsfCountedUsageEvent.record(**_countedusage_kwargs) diff --git a/website/settings/defaults.py b/website/settings/defaults.py index 220d03ba2e6..168b57c0bae 100644 --- a/website/settings/defaults.py +++ b/website/settings/defaults.py @@ -2144,8 +2144,6 @@ def from_node_usage(cls, usage_bytes, private_limit=None, public_limit=None): CAS_LOG_LEVEL = 3 # ERROR -PREPRINT_METRICS_START_DATE = datetime.datetime(2019, 1, 1) - WAFFLE_VALUES_YAML = 'osf/features.yaml' DEFAULT_DRAFT_NODE_TITLE = 'Untitled' USE_COLOR = False