From 77015ff44be1cff80746cdc75da5789e949198c4 Mon Sep 17 00:00:00 2001
From: Roberta Takenaka
Date: Mon, 19 Jan 2026 17:42:34 -0300
Subject: [PATCH 1/8] build: update packtools to version 4.14.0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 requirements/base.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/base.txt b/requirements/base.txt
index 7da57b9f..bbc8f8fc 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -87,7 +87,7 @@ pysolr==3.9.0  # https://pypi.org/project/pysolr/
 # ------------------------------------------------------------------------------
 tornado>=6.5.2 # not directly required, pinned by Snyk to avoid a vulnerability
 lxml==6.0.2  # https://github.com/lxml/lxml
-git+https://git@github.com/scieloorg/packtools@4.13.1#egg=packtools
+git+https://git@github.com/scieloorg/packtools@4.14.0#egg=packtools
 
 # pymongo
 # ------------------------------------------------------------------------------

From 809b488b7294939a2f57d1f43aa138bcc2ec81bc Mon Sep 17 00:00:00 2001
From: Roberta Takenaka
Date: Mon, 19 Jan 2026 17:42:34 -0300
Subject: [PATCH 2/8] feat(pid_provider): add sps_pkg_name and
 deprecated_sps_pkg_name to the query

- Add the cached properties sps_pkg_name and deprecated_sps_pkg_name
- Expand the pkg_name lookup to include all variants
---
 pid_provider/query_params.py | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/pid_provider/query_params.py b/pid_provider/query_params.py
index ef3017b2..57badd93 100644
--- a/pid_provider/query_params.py
+++ b/pid_provider/query_params.py
@@ -63,9 +63,19 @@ def aop_pid(self):
 
     @cached_property
     def pkg_name(self):
-        """Nome do pacote do documento."""
+        """Nome do pacote do documento, parâmetro usado ao instanciar XMLAdapter"""
         return self.xml_adapter.pkg_name
-
+
+    @cached_property
+    def sps_pkg_name(self):
+        """Nome do pacote do documento."""
+        return self.xml_adapter.sps_pkg_name
+
+    @cached_property
+    def deprecated_sps_pkg_name(self):
+        """Nome do pacote do documento (deprecated)."""
+        return self.xml_adapter.sps_pkg_name
+
     @cached_property
     def main_doi(self):
         """DOI principal do documento."""
@@ -176,8 +186,15 @@ def identifier_queries(self):
             q |= Q(v2=self.aop_pid) | Q(aop_pid=self.aop_pid)
 
         # Package name
+        pkg_names = set()
         if self.pkg_name:
-            q |= Q(pkg_name=self.pkg_name)
+            pkg_names.add(self.pkg_name)
+        if self.sps_pkg_name:
+            pkg_names.add(self.sps_pkg_name)
+        if self.deprecated_sps_pkg_name:
+            pkg_names.add(self.deprecated_sps_pkg_name)
+        for pkg_name in pkg_names:
+            q |= Q(pkg_name=pkg_name)
 
         # # DOI principal
         # if self.main_doi:

From b1b82e674a10814c0e14f4f2018f164a9c770ca7 Mon Sep 17 00:00:00 2001
From: Roberta Takenaka
Date: Mon, 19 Jan 2026 17:50:39 -0300
Subject: [PATCH 3/8] feat(pid_provider): refactor deduplication
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add find_duplicated_v2 to find duplicates by pid v2
- Refactor deduplicate_items to support mark_as_duplicated and deduplicate
- Rename fix_duplicated_pkg_name to fix_duplicated_items
- fix_duplicated_items now searches by pkg_name, v2 or other_pid
---
 pid_provider/models.py | 76 ++++++++++++++++++++++++++++++------------
 1 file changed, 53 insertions(+), 23 deletions(-)

diff --git a/pid_provider/models.py b/pid_provider/models.py
index c0535811..c32b4168 100644
--- a/pid_provider/models.py
+++ b/pid_provider/models.py
@@ -1315,7 +1315,7 @@ def mark_items_as_invalid(cls, issns):
     @profile_classmethod
     def find_duplicated_pkg_names(cls, issns):
         # Busca em ambos os campos de ISSN
-        duplicates = (
+        return (
             cls.objects.filter(Q(issn_print__in=issns) | Q(issn_electronic__in=issns))
             .exclude(pkg_name__isnull=True)
             .exclude(pkg_name="")
@@ -1328,25 +1328,31 @@ def find_duplicated_pkg_names(cls, issns):
             .values("pkg_name")
             .annotate(count=Count("id"))
             .filter(count__gt=1)
+            .values_list("pkg_name", flat=True)
         )
-        return list(set(item["pkg_name"] for item in duplicates))
-
-    @classmethod
+
     @profile_classmethod
-    def mark_items_as_duplicated(cls, issns):
-        ppx_duplicated_pkg_names = PidProviderXML.find_duplicated_pkg_names(issns)
-        if not ppx_duplicated_pkg_names:
-            return
-        cls.objects.filter(pkg_name__in=ppx_duplicated_pkg_names).exclude(
-            proc_status=choices.PPXML_STATUS_DUPLICATED
-        ).update(
-            proc_status=choices.PPXML_STATUS_DUPLICATED,
+    def find_duplicated_v2(cls, issns):
+        # Busca em ambos os campos de ISSN
+        return (
+            cls.objects.filter(Q(issn_print__in=issns) | Q(issn_electronic__in=issns))
+            .exclude(v2__isnull=True)
+            .exclude(v2="")
+            .exclude(
+                proc_status__in=[
+                    choices.PPXML_STATUS_DUPLICATED,
+                    choices.PPXML_STATUS_INVALID,
+                ]
+            )
+            .values("v2")
+            .annotate(count=Count("id"))
+            .filter(count__gt=1)
+            .values_list("v2", flat=True)
         )
-        return ppx_duplicated_pkg_names
 
     @classmethod
     @profile_classmethod
-    def deduplicate_items(cls, user, issns):
+    def deduplicate_items(cls, user, issns, mark_as_duplicated=False, deduplicate=False):
         """
         Corrige todos os artigos marcados como DATA_STATUS_DUPLICATED
         com base nos ISSNs fornecidos.
@@ -1354,26 +1360,50 @@ def deduplicate_items(cls, user, issns):
             issns: Lista de ISSNs para verificar duplicatas.
             user: Usuário que está executando a operação.
         """
+        duplicated_v2 = cls.find_duplicated_v2(issns)
+        if duplicated_v2.exists():
+            if mark_as_duplicated:
+                cls.objects.filter(v2__in=duplicated_v2).exclude(
+                    proc_status=choices.PPXML_STATUS_DUPLICATED
+                ).update(
+                    proc_status=choices.PPXML_STATUS_DUPLICATED,
+                )
+            if deduplicate:
+                for v2 in duplicated_v2:
+                    cls.fix_duplicated_items(user, None, v2)
+
         duplicated_pkg_names = cls.find_duplicated_pkg_names(issns)
-        for pkg_name in duplicated_pkg_names:
-            cls.fix_duplicated_pkg_name(pkg_name, user)
-        return duplicated_pkg_names
+        if duplicated_pkg_names.exists():
+            if mark_as_duplicated:
+                cls.objects.filter(pkg_name__in=duplicated_pkg_names).exclude(
+                    proc_status=choices.PPXML_STATUS_DUPLICATED
+                ).update(
+                    proc_status=choices.PPXML_STATUS_DUPLICATED,
+                )
+            if deduplicate:
+                for pkg_name in duplicated_pkg_names:
+                    cls.fix_duplicated_items(user, pkg_name, None)
 
     @classmethod
     @profile_classmethod
-    def fix_duplicated_pkg_name(cls, pkg_name, user):
+    def fix_duplicated_items(cls, user, pkg_name, v2):
         """
         Corrige items marcados como PPXML_STATUS_DUPLICATED
         com base no pkg_name fornecido.
 
         Args:
-            pkg_name: Nome do pacote para verificar duplicatas.
             user: Usuário que está executando a operação.
-
+            pkg_name: Nome do pacote para verificar duplicatas.
+            v2: Valor do pid v2 para verificar duplicatas.
         Returns:
            int: Número de items atualizados.
        """
        try:
-            items = cls.objects.filter(pkg_name=pkg_name)
+            filters = Q()
+            if v2:
+                filters |= Q(v2=v2) | Q(other_pid__pid_in_xml=v2)
+            if pkg_name:
+                filters |= Q(pkg_name=pkg_name)
+            items = cls.objects.filter(filters)
             if items.count() <= 1:
                 return 0
@@ -1409,7 +1439,7 @@ def fix_duplicated_pkg_name(cls, pkg_name, user):
             UnexpectedEvent.create(
                 exception=exception,
                 exc_traceback=exc_traceback,
-                action="pid_provider.models.PidProviderXML.fix_duplicated_pkg_name",
-                detail=pkg_name,
+                action="pid_provider.models.PidProviderXML.fix_duplicated_items",
+                detail=pkg_name or v2,
             )
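
Note on the query idiom: every find_duplicated_* method added in PATCH 3 follows the same Django aggregation pattern — group rows by a candidate key, count each group, keep the keys whose count exceeds one, and return the bare keys as a lazy QuerySet. A minimal self-contained sketch of that pattern, assuming a hypothetical model field named `key` (illustrative only, not project code):

    from django.db.models import Count

    def find_duplicated_keys(queryset):
        # Group rows by `key`, count each group, and keep only the keys
        # that occur more than once; flat=True yields the bare key values.
        return (
            queryset.exclude(key__isnull=True)
            .exclude(key="")
            .values("key")
            .annotate(count=Count("id"))
            .filter(count__gt=1)
            .values_list("key", flat=True)
        )

Because the result is a QuerySet rather than a list, callers can chain .exists() on it or feed it straight into __in lookups without materializing it — which is exactly what the new deduplicate_items does.
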
""" try: - items = cls.objects.filter(pkg_name=pkg_name) + filters = Q() + if v2: + filters |= Q(v2=v2) | Q(other_pid__pid_in_xml=v2) + if pkg_name: + filters |= Q(pkg_name=pkg_name) + items = cls.objects.filter(filters) if items.count() <= 1: return 0 @@ -1409,7 +1439,7 @@ def fix_duplicated_pkg_name(cls, pkg_name, user): UnexpectedEvent.create( exception=exception, exc_traceback=exc_traceback, - action="pid_provider.models.PidProviderXML.fix_duplicated_pkg_name", + action="pid_provider.models.PidProviderXML.fix_duplicated_items", detail=pkg_name, ) From 5e166b21d59737a1590fd4bcc368f6a9c5156c79 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Mon, 19 Jan 2026 17:50:52 -0300 Subject: [PATCH 4/8] refactor(pid_provider): simplifica chamada de deduplicate_items na task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Unifica mark_as_duplicated e deduplicate em única chamada --- pid_provider/tasks.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pid_provider/tasks.py b/pid_provider/tasks.py index e7bd8a23..3cf3ef74 100644 --- a/pid_provider/tasks.py +++ b/pid_provider/tasks.py @@ -281,11 +281,8 @@ def task_fix_journal_pid_provider_xmls_status( if mark_as_invalid: PidProviderXML.mark_items_as_invalid(journal.issns) - if mark_as_duplicated: - PidProviderXML.mark_items_as_duplicated(journal.issns) - - if deduplicate: - PidProviderXML.deduplicate_items(user, journal.issns) + if mark_as_duplicated or deduplicate: + PidProviderXML.deduplicate_items(user, journal.issns, mark_as_duplicated=mark_as_duplicated, deduplicate=deduplicate) return { "status": "success", From 5d3c3659bd7e978d81ff3899839dbce4cd5322c9 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Mon, 19 Jan 2026 17:52:10 -0300 Subject: [PATCH 5/8] =?UTF-8?q?feat(article):=20adiciona=20deduplica=C3=A7?= =?UTF-8?q?=C3=A3o=20por=20pid=5Fv2=20e=20refatora=20m=C3=A9todos?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Adiciona find_duplicated_pid_v2 para buscar duplicatas por pid v2 - Refatora deduplicate_items para suportar mark_as_duplicated e deduplicate - Remove mark_items_as_duplicated (funcionalidade incorporada em deduplicate_items) - Renomeia fix_duplicated_pkg_name para fix_duplicated_items - find_duplicated_pkg_names retorna QuerySet com values_list --- article/models.py | 79 +++++++++++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 27 deletions(-) diff --git a/article/models.py b/article/models.py index 811ee9c3..2b6be4b1 100755 --- a/article/models.py +++ b/article/models.py @@ -930,7 +930,7 @@ def find_duplicated_pkg_names(cls, journal=None, journal_id=None): params["journal"] = journal if journal_id: params["journal__id"] = journal_id - duplicates = ( + return ( cls.objects.filter(**params) .exclude(sps_pkg_name__isnull=True) .exclude(sps_pkg_name="") @@ -938,11 +938,30 @@ def find_duplicated_pkg_names(cls, journal=None, journal_id=None): .values("sps_pkg_name") .annotate(count=Count("id")) .filter(count__gt=1) + .values_list("sps_pkg_name", flat=True) + ) + + @classmethod + def find_duplicated_pid_v2(cls, journal=None, journal_id=None): + # Busca em ambos os campos de ISSN + params = {} + if journal: + params["journal"] = journal + if journal_id: + params["journal__id"] = journal_id + return ( + cls.objects.filter(**params) + .exclude(pid_v2__isnull=True) + .exclude(pid_v2="") + .exclude(data_status=choices.DATA_STATUS_DUPLICATED) + .values("pid_v2") + .annotate(count=Count("id")) + 
.filter(count__gt=1) + .values_list("pid_v2", flat=True) ) - return list(item["sps_pkg_name"] for item in duplicates) @classmethod - def mark_items_as_duplicated(cls, journal=None, journal_id=None): + def deduplicate_items(cls, user, journal=None, journal_id=None, mark_as_duplicated=False, deduplicate=False): """ Corrige todos os artigos marcados como DATA_STATUS_DUPLICATED com base nos ISSNs fornecidos. @@ -950,36 +969,37 @@ def mark_items_as_duplicated(cls, journal=None, journal_id=None): issns: Lista de ISSNs para verificar duplicatas. user: Usuário que está executando a operação. """ - article_duplicated_pkg_names = cls.find_duplicated_pkg_names( + article_duplicated_pid_v2 = cls.find_duplicated_pid_v2( journal, journal_id ) - if not article_duplicated_pkg_names: - return - cls.objects.filter(sps_pkg_name__in=article_duplicated_pkg_names).exclude( - data_status=choices.DATA_STATUS_DUPLICATED - ).update( - data_status=choices.DATA_STATUS_DUPLICATED, - ) - return article_duplicated_pkg_names - - @classmethod - def deduplicate_items(cls, user, journal=None, journal_id=None): - """ - Corrige todos os artigos marcados como DATA_STATUS_DUPLICATED com base nos ISSNs fornecidos. + if article_duplicated_pid_v2.exists(): + if mark_as_duplicated: + cls.objects.filter(spid_v2__in=article_duplicated_pid_v2).exclude( + data_status=choices.DATA_STATUS_DUPLICATED + ).update( + data_status=choices.DATA_STATUS_DUPLICATED, + ) + if deduplicate: + for pid_v2 in article_duplicated_pid_v2: + cls.fix_duplicated_items(user, None, pid_v2) - Args: - issns: Lista de ISSNs para verificar duplicatas. - user: Usuário que está executando a operação. - """ article_duplicated_pkg_names = cls.find_duplicated_pkg_names( journal, journal_id ) - for pkg_name in article_duplicated_pkg_names: - cls.fix_duplicated_pkg_name(pkg_name, user) + if article_duplicated_pkg_names.exists(): + if mark_as_duplicated: + cls.objects.filter(sps_pkg_name__in=article_duplicated_pkg_names).exclude( + data_status=choices.DATA_STATUS_DUPLICATED + ).update( + data_status=choices.DATA_STATUS_DUPLICATED, + ) + if deduplicate: + for pkg_name in article_duplicated_pkg_names: + cls.fix_duplicated_items(user, pkg_name, None) return article_duplicated_pkg_names @classmethod - def fix_duplicated_pkg_name(cls, pkg_name, user): + def fix_duplicated_items(cls, user, pkg_name, pid_v2): """ Corrige artigos marcados como DATA_STATUS_DUPLICATED com base no pkg_name fornecido. @@ -991,7 +1011,12 @@ def fix_duplicated_pkg_name(cls, pkg_name, user): int: Número de artigos atualizados. 
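
Note on the filter construction: both fix_duplicated_items implementations (PATCH 3 and PATCH 5) assemble their lookup by OR-ing Q objects for whichever identifiers were supplied. A hedged sketch of that accumulation, with hypothetical field names (not project code):

    from django.db.models import Q

    def build_duplicate_filter(pkg_name=None, pid_v2=None):
        # OR-in only the criteria that were actually provided.
        filters = Q()
        if pkg_name:
            filters |= Q(sps_pkg_name=pkg_name)
        if pid_v2:
            filters |= Q(pid_v2=pid_v2)
        # Caution: if both arguments are None, `filters` stays empty, and an
        # empty Q() passed to Model.objects.filter() matches ALL rows — the
        # patched methods rely on callers always passing one identifier.
        return filters
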
""" try: - articles = cls.objects.filter(sps_pkg_name=pkg_name).exclude( + filters = Q() + if pkg_name: + filters |= Q(sps_pkg_name=pkg_name) + if pid_v2: + filters |= Q(pid_v2=pid_v2) + articles = cls.objects.filter(filters).exclude( data_status=choices.DATA_STATUS_DUPLICATED ) if articles.count() <= 1: @@ -1014,8 +1039,8 @@ def fix_duplicated_pkg_name(cls, pkg_name, user): UnexpectedEvent.create( exception=exception, exc_traceback=exc_traceback, - action="article.models.Article.fix_duplicated_pkg_name", - detail=pkg_name, + action="article.models.Article.fix_duplicated_items", + detail=pkg_name or pid_v2, ) From d1cc01724d78fa50e56323c211421a59edfb53c0 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Mon, 19 Jan 2026 17:52:25 -0300 Subject: [PATCH 6/8] refactor(article): simplifica chamada de deduplicate_items na task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Unifica mark_as_duplicated e deduplicate em única chamada --- article/tasks.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/article/tasks.py b/article/tasks.py index e0f8517d..d6036fe8 100644 --- a/article/tasks.py +++ b/article/tasks.py @@ -1063,11 +1063,8 @@ def task_fix_journal_articles_status( if mark_as_public: Article.mark_items_as_public(journal_id=journal_id) - if mark_as_duplicated: - Article.mark_items_as_duplicated(journal_id=journal_id) - - if deduplicate: - Article.deduplicate_items(user, journal_id=journal_id) + if mark_as_duplicated or deduplicate: + Article.deduplicate_items(user, journal_id=journal_id, mark_as_duplicated=mark_as_duplicated, deduplicate=deduplicate) return { "status": "success", From 5693777905ab8d626c4374ca2a205dfdc9a5aceb Mon Sep 17 00:00:00 2001 From: Roberta Takenaka <505143+robertatakenaka@users.noreply.github.com> Date: Mon, 19 Jan 2026 18:39:31 -0300 Subject: [PATCH 7/8] Update article/models.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- article/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article/models.py b/article/models.py index 2b6be4b1..eaf33388 100755 --- a/article/models.py +++ b/article/models.py @@ -974,7 +974,7 @@ def deduplicate_items(cls, user, journal=None, journal_id=None, mark_as_duplicat ) if article_duplicated_pid_v2.exists(): if mark_as_duplicated: - cls.objects.filter(spid_v2__in=article_duplicated_pid_v2).exclude( + cls.objects.filter(pid_v2__in=article_duplicated_pid_v2).exclude( data_status=choices.DATA_STATUS_DUPLICATED ).update( data_status=choices.DATA_STATUS_DUPLICATED, From 26d28771fbb5f440cb64790b39d188e7b8da1f3e Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Thu, 22 Jan 2026 19:00:01 -0300 Subject: [PATCH 8/8] =?UTF-8?q?Corrige=20a=20express=C3=A3o=20de=20busca?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pid_provider/query_params.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pid_provider/query_params.py b/pid_provider/query_params.py index 57badd93..bfb6901b 100644 --- a/pid_provider/query_params.py +++ b/pid_provider/query_params.py @@ -193,8 +193,8 @@ def identifier_queries(self): pkg_names.add(self.sps_pkg_name) if self.deprecated_sps_pkg_name: pkg_names.add(self.deprecated_sps_pkg_name) - for pkg_name in pkg_names: - q |= Q(pkg_name=pkg_name) + if pkg_names: + q |= Q(pkg_name__in=pkg_names) # # DOI principal # if self.main_doi: