histcite
What is HistCite-Python?
HistCite-Python is a Python package for parsing scientific papers' references and recognizing the citation relationships between them.
It originated from the HistCite project, which has not been maintained by Clarivate for several years. Built on pandas 2.0 and Graphviz, HistCite-Python implements the core functions of HistCite and adds some new features.
- Supports multiple operating systems: Windows, Linux, and macOS.
- Supports multiple literature databases: Web of Science, Scopus, and CSSCI.
HistCite-Python is an open-source project; you can find the source code on GitHub. If you have any questions or suggestions, please submit an issue on GitHub.
Of course, you are welcome to contribute to this project.
"""
## What is HistCite-Python?
HistCite-Python is a Python package for parsing scientific papers' references
and recognizing the citation relationships between them.

It originated from the [HistCite project](https://support.clarivate.com/ScientificandAcademicResearch/s/article/HistCite-No-longer-in-active-development-or-officially-supported),
which has not been maintained by Clarivate for several years. Built on pandas 2.0
and Graphviz, HistCite-Python implements the core functions of HistCite and adds
some new features.

- Supports multiple operating systems: Windows, Linux and macOS.
- Supports multiple literature databases: Web of Science, Scopus, CSSCI.

HistCite-Python is an **open source** project; you can find the source code on
[GitHub](https://github.com/doublessay/histcite-python).
If you have any questions or suggestions, please submit an issue on GitHub.

Certainly, welcome to contribute to this project.
"""

__version__ = "0.5.2"

from .compute_metrics import ComputeMetrics
from .network_graph import GraphViz
from .parse_reference import ParseReference
from .process_file import ProcessFile
from .read_file import (
    ReadFile,
    ReadWosFile,
    ReadCssciFile,
    ReadScopusFile,
)
from .recognize_reference import RecognizeReference

# Public API of the package.
__all__ = [
    "ComputeMetrics",
    "GraphViz",
    "ParseReference",
    "ProcessFile",
    "ReadFile",
    "ReadWosFile",
    "ReadCssciFile",
    "ReadScopusFile",
    "RecognizeReference",
]
class ComputeMetrics:
    """Compute descriptive statistics of docs.

    Attributes:
        merged_docs_df: DataFrame of docs merged with citation relationship.
        source_type: Source type of docs, `wos`, `cssci` or `scopus`.
    """

    def __init__(
        self,
        docs_df: pd.DataFrame,
        citation_relationship: pd.DataFrame,
        source_type: Literal["wos", "cssci", "scopus"],
    ):
        """
        Args:
            docs_df: DataFrame of docs.
            citation_relationship: DataFrame of citation relationship.
            source_type: Source type of docs, `wos`, `cssci` or `scopus`.
        """
        # Attach the local citation metrics (LCR/LCS) to every doc row.
        self.merged_docs_df: pd.DataFrame = docs_df.merge(
            citation_relationship[["doc_index", "LCR", "LCS"]], on="doc_index"
        )
        self.source_type: Literal["wos", "cssci", "scopus"] = source_type

    @staticmethod
    def generate_df_factory(
        merged_docs_df: pd.DataFrame,
        use_cols: list[str],
        col: str,
        split_char: Optional[str] = None,
        str_lower: bool = False,
        sort_by_col: Literal["Recs", "TLCS", "TGCS"] = "Recs",
    ) -> pd.DataFrame:
        """A factory method to generate DataFrame of specific field.
        You can analyze any field besides the provided functions through this method.

        Args:
            merged_docs_df: DataFrame of docs merged with citation relationship.
            use_cols: Columns to use, e.g. `["AU", "LCS", "TC"]`.
            col: Column to analyze, e.g. `AU`.
            split_char: Separator used to split multi-value cells, e.g. `"; "`.
                Defaults to `None` (no splitting).
            str_lower: Whether to convert string to lowercase. Defaults to `False`.
            sort_by_col: Sort DataFrame by column, `Recs`, `TLCS` or `TGCS`.
                Defaults to `Recs`.

        Returns:
            DataFrame indexed by the distinct values of `col`, with a record
            count (`Recs`) and, when the columns are selected, summed citation
            counts (`TLCS` from `LCS`, `TGCS` from `TC`).
        """
        assert col in use_cols, "Argument <col> must be in use_cols"
        if sort_by_col == "TLCS":
            assert "LCS" in use_cols, "LCS must be in use_cols before sorting by TLCS"
        elif sort_by_col == "TGCS":
            assert "TC" in use_cols, "TC must be in use_cols before sorting by TGCS"

        # Work on an explicit copy so the in-place column assignments below
        # never warn about (or write through to) merged_docs_df.
        df = merged_docs_df[use_cols].copy()
        if split_char:
            df = df.dropna(subset=[col])
            df = df.astype({col: "str"})
            if str_lower:
                df[col] = df[col].str.lower()
            # One row per individual value (e.g. per author).
            df[col] = df[col].str.split(split_char)
            df = df.explode(col).reset_index(drop=True)

        # Always count records; sum each citation column whenever it is
        # selected (the original only summed TC when LCS was also present).
        agg_spec: dict[str, str] = {col: "count"}
        if "LCS" in use_cols:
            agg_spec["LCS"] = "sum"
        if "TC" in use_cols:
            agg_spec["TC"] = "sum"
        grouped_df = df.groupby(col).agg(agg_spec)

        grouped_df.rename(
            columns={col: "Recs", "LCS": "TLCS", "TC": "TGCS"}, inplace=True
        )
        # Scopus appends an author id, e.g. "Andersson, Gerhard (7202645907)";
        # strip it so identical authors collapse into one row.
        if col == "Author full names":
            grouped_df.index = grouped_df.index.str.replace(r" \(\d+\)", "", regex=True)

        # Defensive fallback for callers that explicitly pass a falsy value.
        if not sort_by_col:
            sort_by_col = "Recs"
        return grouped_df.sort_values(sort_by_col, ascending=False)

    def generate_records_df(self) -> pd.DataFrame:
        """Return records DataFrame. Similar to `merged_docs_df`."""
        if self.source_type in ["wos", "scopus"]:
            # Fix: the original listed "TI" twice, duplicating the title
            # column in the exported records.
            use_cols = [
                "AU",
                "TI",
                "SO",
                "PY",
                "LCS",
                "TC",
                "LCR",
                "NR",
                "source file",
            ]
        elif self.source_type == "cssci":
            use_cols = ["AU", "TI", "SO", "PY", "LCS", "LCR", "NR", "source file"]
        else:
            raise ValueError("Invalid source type")
        records_df = self.merged_docs_df[use_cols]
        # Export the global metrics under the names used elsewhere in the
        # project: TC -> GCS, NR -> GCR.
        rename_map: dict[str, str] = {}
        if "TC" in use_cols:
            rename_map["TC"] = "GCS"
        if "NR" in use_cols:
            rename_map["NR"] = "GCR"
        return records_df.rename(columns=rename_map)

    def generate_author_df(self) -> pd.DataFrame:
        """Return author DataFrame."""
        if self.source_type == "wos":
            use_cols = ["AU", "LCS", "TC"]
        elif self.source_type == "cssci":
            use_cols = ["AU", "LCS"]
        elif self.source_type == "scopus":
            use_cols = ["Author full names", "LCS", "TC"]
        else:
            raise ValueError("Invalid source type")
        # Author cells are `; `-separated lists of names.
        return self.generate_df_factory(
            self.merged_docs_df, use_cols, use_cols[0], "; "
        )

    def generate_keyword_df(self) -> pd.DataFrame:
        """Return keyword DataFrame."""
        if self.source_type in ["wos", "scopus"]:
            use_cols = ["DE", "LCS", "TC"]
        elif self.source_type == "cssci":
            use_cols = ["DE", "LCS"]
        else:
            raise ValueError("Invalid source type")
        # Keywords are lowercased so case variants count as one keyword.
        return self.generate_df_factory(self.merged_docs_df, use_cols, "DE", "; ", True)

    def generate_institution_df(self) -> pd.DataFrame:
        """Return institution DataFrame. Not support Scopus."""
        assert (
            self.source_type != "scopus"
        ), "Scopus is not supported to analyze institution field yet."
        if self.source_type == "wos":
            use_cols = ["C3", "LCS", "TC"]
        elif self.source_type == "cssci":
            use_cols = ["C3", "LCS"]
        else:
            raise ValueError("Invalid source type")
        return self.generate_df_factory(self.merged_docs_df, use_cols, "C3", "; ")

    def generate_journal_df(self) -> pd.DataFrame:
        """Return journal DataFrame."""
        if self.source_type in ["wos", "scopus"]:
            use_cols = ["SO", "LCS", "TC"]
        elif self.source_type == "cssci":
            use_cols = ["SO", "LCS"]
        else:
            raise ValueError("Invalid source type")
        return self.generate_df_factory(self.merged_docs_df, use_cols, "SO")

    def generate_year_df(self) -> pd.DataFrame:
        """Return publication year DataFrame. Sort by `PY` ascending."""
        use_cols = ["PY"]
        return self.generate_df_factory(
            self.merged_docs_df, use_cols, "PY"
        ).sort_values(by="PY")

    def generate_document_type_df(self) -> pd.DataFrame:
        """Return document type DataFrame. Not support CSSCI."""
        assert self.source_type != "cssci", "CSSCI doesn't have document type info"
        use_cols = ["DT"]
        return self.generate_df_factory(self.merged_docs_df, use_cols, "DT")

    def write2excel(self, save_path: str):
        """Write all dataframes to an excel file. Each dataframe is a sheet.

        Args:
            save_path: The path to save the excel file.

        Returns:
            An excel file with multiple sheets.
        """
        save_folder_path = os.path.dirname(save_path)
        # Fix: dirname() returns "" for a bare file name and os.makedirs("")
        # raises FileNotFoundError, so only create real directory components.
        if save_folder_path:
            os.makedirs(save_folder_path, exist_ok=True)
        with pd.ExcelWriter(save_path) as writer:
            self.generate_records_df().to_excel(
                writer, sheet_name="Records", index=False
            )
            self.generate_author_df().to_excel(writer, sheet_name="Authors")
            self.generate_journal_df().to_excel(writer, sheet_name="Journals")
            self.generate_keyword_df().to_excel(writer, sheet_name="Keywords")
            self.generate_year_df().to_excel(writer, sheet_name="Years")
            # Institutions are unavailable for Scopus, document types for CSSCI.
            if self.source_type in ["wos", "cssci"]:
                self.generate_institution_df().to_excel(
                    writer, sheet_name="Institutions"
                )
            if self.source_type in ["wos", "scopus"]:
                self.generate_document_type_df().to_excel(
                    writer, sheet_name="Document Type"
                )
Compute descriptive statistics of docs.
Attributes:
- merged_docs_df: DataFrame of docs merged with citation relationship.
- source_type: Source type of docs: `wos`, `cssci`, or `scopus`.
def __init__(
    self,
    docs_df: pd.DataFrame,
    citation_relationship: pd.DataFrame,
    source_type: Literal["wos", "cssci", "scopus"],
):
    """
    Args:
        docs_df: DataFrame of docs.
        citation_relationship: DataFrame of citation relationship.
        source_type: Source type of docs, `wos`, `cssci` or `scopus`.
    """
    # Attach the local citation metrics (LCR/LCS) to every doc row,
    # keyed on the shared `doc_index` column.
    self.merged_docs_df: pd.DataFrame = docs_df.merge(
        citation_relationship[["doc_index", "LCR", "LCS"]], on="doc_index"
    )
    self.source_type: Literal["wos", "cssci", "scopus"] = source_type
Arguments:
- docs_df: DataFrame of docs.
- citation_relationship: DataFrame of citation relationship.
- source_type: Source type of docs: `wos`, `cssci`, or `scopus`.
@staticmethod
def generate_df_factory(
    merged_docs_df: pd.DataFrame,
    use_cols: list[str],
    col: str,
    split_char: Optional[str] = None,
    str_lower: bool = False,
    sort_by_col: Literal["Recs", "TLCS", "TGCS"] = "Recs",
) -> pd.DataFrame:
    """A factory method to generate DataFrame of specific field.
    You can analyze any field besides the provided functions through this method.

    Args:
        merged_docs_df: DataFrame of docs merged with citation relationship.
        use_cols: Columns to use, e.g. `["AU", "LCS", "TC"]`.
        col: Column to analyze, e.g. `AU`.
        split_char: Whether to split string, e.g. `; `. Defaults to `None`.
        str_lower: Whether to convert string to lowercase. Defaults to `False`.
        sort_by_col: Sort DataFrame by column, `Recs`, `TLCS` or `TGCS`. Defaults to `Recs`.

    Returns:
        DataFrame of specific field.
    """
    assert col in use_cols, "Argument <col> must be in use_cols"
    if sort_by_col == "TLCS":
        assert "LCS" in use_cols, "LCS must be in use_cols before sorting by TLCS"
    elif sort_by_col == "TGCS":
        assert "TC" in use_cols, "TC must be in use_cols before sorting by TGCS"

    # NOTE(review): `df` is a slice of merged_docs_df; the assignments below
    # may trigger SettingWithCopyWarning — consider adding .copy() here.
    df = merged_docs_df[use_cols]
    if split_char:
        # Multi-value cells (e.g. "a; b") become one row per value.
        df = df.dropna(subset=[col])
        df = df.astype({col: "str"})
        if str_lower:
            df[col] = df[col].str.lower()
        df[col] = df[col].str.split(split_char)
        df = df.explode(col)
        df = df.reset_index(drop=True)

    # Count records per value; sum the citation columns that were selected.
    # NOTE(review): TC is only summed when LCS is also present — confirm intended.
    if "LCS" in use_cols:
        if "TC" in use_cols:
            grouped_df = df.groupby(col).agg(
                {col: "count", "LCS": "sum", "TC": "sum"}
            )
        else:
            grouped_df = df.groupby(col).agg({col: "count", "LCS": "sum"})
    else:
        grouped_df = df.groupby(col).agg({col: "count"})

    grouped_df.rename(
        columns={col: "Recs", "LCS": "TLCS", "TC": "TGCS"}, inplace=True
    )
    # Strip the Scopus author id suffix, e.g. "Andersson, Gerhard (7202645907)".
    if col == "Author full names":
        grouped_df.index = grouped_df.index.str.replace(r" \(\d+\)", "", regex=True)

    # Fallback when a caller explicitly passes a falsy sort_by_col.
    if not sort_by_col:
        sort_by_col = "Recs"
    return grouped_df.sort_values(sort_by_col, ascending=False)
A factory method to generate DataFrame of specific field. You can analyze any field besides the provided functions through this method.
Arguments:
- merged_docs_df: DataFrame of docs merged with citation relationship.
- use_cols: Columns to use, e.g. `["AU", "LCS", "TC"]`.
- col: Column to analyze, e.g. `AU`.
- split_char: Separator used to split multi-value strings, e.g. `"; "`. Defaults to `None`.
- str_lower: Whether to convert strings to lowercase. Defaults to `False`.
- sort_by_col: Sort the DataFrame by `Recs`, `TLCS`, or `TGCS`. Defaults to `Recs`.
Returns:
DataFrame of specific field.
def generate_records_df(self) -> pd.DataFrame:
    """Return records DataFrame. Similar to `merged_docs_df`."""
    if self.source_type in ["wos", "scopus"]:
        # NOTE(review): "TI" is listed twice, so the title column is
        # duplicated in the output — confirm and drop one occurrence.
        use_cols = [
            "AU",
            "TI",
            "SO",
            "PY",
            "TI",
            "LCS",
            "TC",
            "LCR",
            "NR",
            "source file",
        ]
    elif self.source_type == "cssci":
        use_cols = ["AU", "TI", "SO", "PY", "LCS", "LCR", "NR", "source file"]
    else:
        raise ValueError("Invalid source type")
    records_df = self.merged_docs_df[use_cols]
    # Export global metrics under the names used elsewhere: TC -> GCS, NR -> GCR.
    if "TC" in use_cols:
        records_df = records_df.rename(columns={"TC": "GCS"})
    if "NR" in use_cols:
        records_df = records_df.rename(columns={"NR": "GCR"})
    return records_df
Return records DataFrame. Similar to `merged_docs_df`.
def generate_keyword_df(self) -> pd.DataFrame:
    """Return keyword DataFrame."""
    if self.source_type in ["wos", "scopus"]:
        use_cols = ["DE", "LCS", "TC"]
    elif self.source_type == "cssci":
        use_cols = ["DE", "LCS"]
    else:
        raise ValueError("Invalid source type")
    # Keywords are `; `-separated and lowercased (str_lower=True) so case
    # variants of the same keyword count as one.
    return self.generate_df_factory(self.merged_docs_df, use_cols, "DE", "; ", True)
Return keyword DataFrame.
def generate_institution_df(self) -> pd.DataFrame:
    """Return institution DataFrame. Not support Scopus."""
    # Institution info ("C3") is only extracted for WOS and CSSCI sources.
    assert (
        self.source_type != "scopus"
    ), "Scopus is not supported to analyze institution field yet."
    if self.source_type == "wos":
        use_cols = ["C3", "LCS", "TC"]
    elif self.source_type == "cssci":
        use_cols = ["C3", "LCS"]
    else:
        raise ValueError("Invalid source type")
    return self.generate_df_factory(self.merged_docs_df, use_cols, "C3", "; ")
Return institution DataFrame. Not support Scopus.
def generate_journal_df(self) -> pd.DataFrame:
    """Return journal DataFrame."""
    if self.source_type in ["wos", "scopus"]:
        use_cols = ["SO", "LCS", "TC"]
    elif self.source_type == "cssci":
        use_cols = ["SO", "LCS"]
    else:
        raise ValueError("Invalid source type")
    # "SO" holds a single journal name per doc, so no split_char is needed.
    return self.generate_df_factory(self.merged_docs_df, use_cols, "SO")
Return journal DataFrame.
def generate_year_df(self) -> pd.DataFrame:
    """Return publication year DataFrame. Sort by `PY` ascending."""
    use_cols = ["PY"]
    # The factory sorts by Recs descending; re-sort chronologically by PY.
    return self.generate_df_factory(
        self.merged_docs_df, use_cols, "PY"
    ).sort_values(by="PY")
Return publication year DataFrame. Sorted by `PY` in ascending order.
def generate_document_type_df(self) -> pd.DataFrame:
    """Return document type DataFrame. Not support CSSCI."""
    assert self.source_type != "cssci", "CSSCI doesn't have document type info"
    use_cols = ["DT"]
    return self.generate_df_factory(self.merged_docs_df, use_cols, "DT")
Return document type DataFrame. Not support CSSCI.
def write2excel(self, save_path: str):
    """Write all dataframes to an excel file. Each dataframe is a sheet.

    Args:
        save_path: The path to save the excel file.

    Returns:
        An excel file with multiple sheets.
    """
    save_folder_path = os.path.dirname(save_path)
    # NOTE(review): dirname() returns "" for a bare file name, and
    # os.makedirs("") raises FileNotFoundError — guard with `if save_folder_path:`.
    if not os.path.exists(save_folder_path):
        os.makedirs(save_folder_path)
    with pd.ExcelWriter(save_path) as writer:
        self.generate_records_df().to_excel(
            writer, sheet_name="Records", index=False
        )
        self.generate_author_df().to_excel(writer, sheet_name="Authors")
        self.generate_journal_df().to_excel(writer, sheet_name="Journals")
        self.generate_keyword_df().to_excel(writer, sheet_name="Keywords")
        self.generate_year_df().to_excel(writer, sheet_name="Years")

        # if self.refs_df is not None:
        #     self.generate_reference_df().to_excel(
        #         writer, sheet_name="Cited References", index=False
        #     )
        # Institutions are unavailable for Scopus, document types for CSSCI.
        if self.source_type in ["wos", "cssci"]:
            self.generate_institution_df().to_excel(
                writer, sheet_name="Institutions"
            )
        if self.source_type in ["wos", "scopus"]:
            self.generate_document_type_df().to_excel(
                writer, sheet_name="Document Type"
            )
Write all dataframes to an excel file. Each dataframe is a sheet.
Arguments:
- save_path: The path to save the excel file.
Returns:
An excel file with multiple sheets.
class GraphViz:
    """Generate dot file for Graphviz. Support citation network of multi docs and specific doc.

    Attributes:
        empty_year_index: Index of docs without `PY` info. These docs will be removed
            from citation network if `generate_dot_file.show_timeline = True`.
        merged_docs_df: DataFrame of docs with citation relationship.
        source_type: Source type of docs, `wos`, `cssci` or `scopus`.
    """

    def __init__(
        self,
        docs_df: pd.DataFrame,
        citation_relationship: pd.DataFrame,
        source_type: Literal["wos", "cssci", "scopus"],
    ):
        """
        Args:
            docs_df: DataFrame of docs.
            citation_relationship: DataFrame of citation relationship.
            source_type: Source type of docs, `wos`, `cssci` or `scopus`.
        """
        # Docs lacking a publication year cannot be placed on the timeline.
        self.empty_year_index: pd.Index = docs_df[docs_df["PY"].isna()].index
        # Row-aligned merge; both frames carry a `doc_index` column, so the
        # right-hand copy gets the `_y` suffix and is dropped.
        self.merged_docs_df: pd.DataFrame = docs_df.merge(
            citation_relationship,
            left_index=True,
            right_index=True,
            suffixes=(None, "_y"),
        ).drop(columns=["doc_index_y"])
        self.source_type: Literal["wos", "cssci", "scopus"] = source_type

    @staticmethod
    def _generate_edge(
        doc_index: int,
        related_doc_index_list: Union[str, list[int]],
        citation_type: Literal["cited", "citing"],
    ) -> set[tuple[int, int]]:
        """Build directed edges between `doc_index` and its related docs.

        `related_doc_index_list` may be a `;`-separated string of indices.
        For `cited`, edges point doc -> reference; for `citing`, citer -> doc.
        """
        if isinstance(related_doc_index_list, str):
            related_doc_index_list = [int(i) for i in related_doc_index_list.split(";")]
        if citation_type == "cited":
            return {(doc_index, ref) for ref in related_doc_index_list}
        else:
            return {(citation, doc_index) for citation in related_doc_index_list}

    def _generate_edge_set_from_specific_doc(
        self,
        doc_index: int,
        edge_type: Literal["cited", "citing"],
    ) -> set[tuple[int, int]]:
        """Collect all edges reachable from one doc by following `edge_type` links."""
        column = "cited_doc_index" if edge_type == "cited" else "citing_doc_index"
        edge_set: set[tuple[int, int]] = set()
        visited: set[int] = set()
        pending_doc_index: list[int] = [doc_index]
        while pending_doc_index:
            current = pending_doc_index.pop()
            # Fix: skip already-expanded docs so a cycle in the citation data
            # cannot make this traversal loop forever.
            if current in visited:
                continue
            visited.add(current)
            cell = self.merged_docs_df.loc[current, column]
            # Missing relationships are NaN/None; only strings hold indices.
            if isinstance(cell, str):
                related = [int(i) for i in cell.split(";")]
                pending_doc_index.extend(related)
                edge_set.update(self._generate_edge(current, related, edge_type))
        return edge_set

    def _generate_edge_set_from_multi_doc(
        self, doc_indices: list[int]
    ) -> set[tuple[int, int]]:
        """Collect edges among the given docs, keeping only edges whose both ends are selected."""
        edge_set: set[tuple[int, int]] = set()
        for idx in doc_indices:
            cited_doc_index = self.merged_docs_df.loc[idx, "cited_doc_index"]
            citing_doc_index = self.merged_docs_df.loc[idx, "citing_doc_index"]
            if isinstance(cited_doc_index, str):
                edge_set.update(self._generate_edge(idx, cited_doc_index, "cited"))
            if isinstance(citing_doc_index, str):
                edge_set.update(self._generate_edge(idx, citing_doc_index, "citing"))
        # Restrict to the selected subset; a set gives O(1) membership tests.
        selected = set(doc_indices)
        return {
            (source, target)
            for source, target in edge_set
            if source in selected and target in selected
        }

    def _generate_edge_set(self) -> dict[int, list[int]]:
        """Build the adjacency dict {source: [targets]} and record `self.node_list`."""
        if len(self.doc_indices) > 1:
            edge_set = self._generate_edge_set_from_multi_doc(self.doc_indices)
        else:
            initial_doc_index = self.doc_indices[0]
            if self.edge_type == "cited":
                edge_set = self._generate_edge_set_from_specific_doc(
                    initial_doc_index, "cited"
                )
            elif self.edge_type == "citing":
                edge_set = self._generate_edge_set_from_specific_doc(
                    initial_doc_index, "citing"
                )
            elif self.edge_type is None:
                # Follow both directions from the seed doc.
                edge_set = self._generate_edge_set_from_specific_doc(
                    initial_doc_index, "cited"
                )
                edge_set.update(
                    self._generate_edge_set_from_specific_doc(
                        initial_doc_index, "citing"
                    )
                )
            else:
                raise ValueError(
                    'Argument <edge_type> must be one of "cited", "citing" or None'
                )

        # Drop nodes without PY info
        if len(self.empty_year_index) > 0 and self.show_timeline is True:
            edge_set = {
                (source, target)
                for source, target in edge_set
                if source not in self.empty_year_index
                and target not in self.empty_year_index
            }

        # Build node_list according to edges
        source_node = {i for i, _ in edge_set}
        target_node = {j for _, j in edge_set}
        self.node_list = sorted(source_node | target_node)

        edge_dict: dict[int, list[int]] = {i: [] for i in sorted(source_node)}
        for source, target in edge_set:
            edge_dict[source].append(target)
        return edge_dict

    def _obtain_groups(self):
        """Obtain groups of doc_index by year."""
        year_series = self.merged_docs_df.loc[self.node_list, "PY"]
        year_groups = year_series.groupby(year_series).groups.items()
        year_list = [year for year, _ in year_groups]
        grouped_doc_index = [list(indices) for _, indices in year_groups]
        if self.show_timeline is True:
            # Prepend the year label so it shares a rank with that year's docs.
            for idx, year in enumerate(year_list):
                grouped_doc_index[idx].insert(0, year)
        return grouped_doc_index, year_list

    def generate_dot_file(
        self,
        doc_indices: Union[list[int], int],
        edge_type: Optional[Literal["cited", "citing"]] = None,
        show_timeline: bool = True,
    ) -> str:
        """Build the Graphviz dot source for the selected docs.

        Args:
            doc_indices: Specific doc_index or list of doc_index. If list, only show edges between these doc_index.
            edge_type: Only for specific doc_index. It can be `cited`, `citing` or `None`. If `None`, show both `cited` and `citing` edges. Default `None`.
            show_timeline: Whether show timeline. In some cases, timeline may be disorderly, so you can set it to `False`. Default `True`.

        Returns:
            Dot file content.
        """
        if isinstance(doc_indices, int):
            assert (
                doc_indices in self.merged_docs_df.index
            ), "Don't select doc_index not in docs_df."
            assert (
                doc_indices not in self.empty_year_index
            ), "Don't select doc_index without <PY> info."
            doc_indices = [doc_indices]
        else:
            assert len(doc_indices) > 0, "Argument <doc_indices> can't be empty."
            if len(doc_indices) > 1:
                assert (
                    edge_type is None
                ), "Argument <edge_type> should be None if <doc_indices> contains >1 elements."
        # Fix: a single-element list previously matched neither branch, leaving
        # self.doc_indices unset (or stale); now any non-empty list is accepted.
        self.doc_indices = doc_indices
        self.edge_type = edge_type
        self.show_timeline = show_timeline

        edge_dict = self._generate_edge_set()
        grouped_doc_index, year_list = self._obtain_groups()

        # One `rank=same` line per year keeps that year's docs on one level.
        dot_groups = [
            f'\t{{rank=same; {" ".join([str(i) for i in group_index])}}};\n'
            for group_index in grouped_doc_index
        ]
        dot_edge_list = [
            f"\t{source} -> "
            + "{ "
            + " ".join([str(i) for i in edge_dict[source]])
            + " };\n"
            for source in edge_dict.keys()
        ]

        if self.show_timeline is True:
            # Chain the year nodes with invisible edges so Graphviz lays them
            # out as a vertical timeline (rankdir=BT puts older years lower).
            reversed_year_list = year_list[::-1]
            year_edge_list = [
                (year, reversed_year_list[idx + 1])
                for idx, year in enumerate(reversed_year_list)
                if idx < len(reversed_year_list) - 1
            ]
            dot_year_node_list = [
                f'\t{year} [ shape="plaintext" ];\n' for year in year_list
            ]
            dot_year_edge_list = [
                f"\t{edge[0]} -> {edge[1]} [ style = invis ];\n"
                for edge in year_edge_list
            ]
        else:
            dot_year_node_list, dot_year_edge_list = [], []

        dot_text = "digraph metadata{\n\trankdir = BT;\n"
        for dot_group in dot_groups:
            dot_text += dot_group

        for dot_year_node in dot_year_node_list:
            dot_text += dot_year_node

        for dot_year_edge in dot_year_edge_list:
            dot_text += dot_year_edge

        for dot_edge in dot_edge_list:
            dot_text += dot_edge
        dot_text += "}"
        return dot_text

    def generate_graph_node_info(self) -> pd.DataFrame:
        """Generate dataframe of graph node info. Columns differ according to `source_type`.

        Returns:
            Dataframe of graph node info.
        """
        if self.source_type == "wos":
            use_cols = ["doc_index", "AU", "TI", "PY", "SO", "LCS", "TC"]
        elif self.source_type == "cssci":
            use_cols = ["doc_index", "AU", "TI", "PY", "SO", "LCS"]
        elif self.source_type == "scopus":
            use_cols = ["doc_index", "AU", "TI", "PY", "SO", "LCS", "TC"]
        else:
            raise ValueError("invalid source type")
        # Only docs that actually appear in the generated graph.
        graph_node_info = self.merged_docs_df.loc[self.node_list, use_cols]
        if "TC" in graph_node_info.columns:
            graph_node_info.rename(columns={"TC": "GCS"}, inplace=True)
        return graph_node_info

    def _export_graph_node_info(self, file_path: str):
        """Write the graph node info to an excel file."""
        self.generate_graph_node_info().to_excel(file_path, index=False)
Generate dot file for Graphviz. Support citation network of multi docs and specific doc.
Attributes:
- empty_year_index: Index of docs without `PY` info. These docs are removed from the citation network when `generate_dot_file` is called with `show_timeline=True`.
- merged_docs_df: DataFrame of docs with citation relationship.
- source_type: Source type of docs: `wos`, `cssci`, or `scopus`.
def __init__(
    self,
    docs_df: pd.DataFrame,
    citation_relationship: pd.DataFrame,
    source_type: Literal["wos", "cssci", "scopus"],
):
    """
    Args:
        docs_df: DataFrame of docs.
        citation_relationship: DataFrame of citation relationship.
        source_type: Source type of docs, `wos`, `cssci` or `scopus`.
    """
    # Docs with no publication year cannot be placed on the timeline.
    self.empty_year_index: pd.Index = docs_df[docs_df["PY"].isna()].index
    # Row-aligned merge; both frames have a `doc_index` column, so the
    # right-hand copy gets the `_y` suffix and is dropped afterwards.
    self.merged_docs_df: pd.DataFrame = docs_df.merge(
        citation_relationship,
        left_index=True,
        right_index=True,
        suffixes=(None, "_y"),
    ).drop(columns=["doc_index_y"])
    self.source_type: Literal["wos", "cssci", "scopus"] = source_type
Arguments:
- docs_df: DataFrame of docs.
- citation_relationship: DataFrame of citation relationship.
- source_type: Source type of docs: `wos`, `cssci`, or `scopus`.
def generate_dot_file(
    self,
    doc_indices: Union[list[int], int],
    edge_type: Optional[Literal["cited", "citing"]] = None,
    show_timeline: bool = True,
) -> str:
    """Build the Graphviz dot source for the selected docs.

    Args:
        doc_indices: Specific doc_index or list of doc_index. If list, only show edges between these doc_index.
        edge_type: Only for specific doc_index. It can be `cited`, `citing` or `None`. If `None`, show both `cited` and `citing` edges. Default `None`.
        show_timeline: Whether show timeline. In some cases, timeline may be disorderly, so you can set it to `False`. Default `True`.

    Returns:
        Dot file content.
    """
    # NOTE(review): a list containing exactly one element matches neither
    # branch below, leaving self.doc_indices unset (or stale from a previous
    # call) — confirm and handle single-element lists explicitly.
    if isinstance(doc_indices, list) and len(doc_indices) > 1:
        assert (
            edge_type is None
        ), "Argument <edge_type> should be None if <doc_indices> contains >1 elements."
        self.doc_indices = doc_indices
    elif isinstance(doc_indices, int):
        assert (
            doc_indices in self.merged_docs_df.index
        ), "Don't select doc_index not in docs_df."
        assert (
            doc_indices not in self.empty_year_index
        ), "Don't select doc_index without <PY> info."
        self.doc_indices = [doc_indices]
    self.edge_type = edge_type
    self.show_timeline = show_timeline

    edge_dict = self._generate_edge_set()
    grouped_doc_index, year_list = self._obtain_groups()

    # One `rank=same` line per publication year keeps that year's docs level.
    dot_groups = [
        f'\t{{rank=same; {" ".join([str(i) for i in group_index])}}};\n'
        for group_index in grouped_doc_index
    ]
    dot_edge_list = [
        f"\t{source} -> "
        + "{ "
        + " ".join([str(i) for i in edge_dict[source]])
        + " };\n"
        for source in edge_dict.keys()
    ]

    if self.show_timeline is True:
        # Chain year nodes with invisible edges so Graphviz lays them out
        # as a vertical timeline.
        reversed_year_list = year_list[::-1]
        year_edge_list = [
            (year, reversed_year_list[idx + 1])
            for idx, year in enumerate(reversed_year_list)
            if idx < len(reversed_year_list) - 1
        ]
        dot_year_node_list = [
            f'\t{year} [ shape="plaintext" ];\n' for year in year_list
        ]
        dot_year_edge_list = [
            f"\t{edge[0]} -> {edge[1]} [ style = invis ];\n"
            for edge in year_edge_list
        ]
    else:
        dot_year_node_list, dot_year_edge_list = [], []

    dot_text = "digraph metadata{\n\trankdir = BT;\n"
    for dot_group in dot_groups:
        dot_text += dot_group

    for dot_year_node in dot_year_node_list:
        dot_text += dot_year_node

    for dot_year_edge in dot_year_edge_list:
        dot_text += dot_year_edge

    for dot_edge in dot_edge_list:
        dot_text += dot_edge
    dot_text += "}"
    return dot_text
Arguments:
- doc_indices: Specific doc_index or list of doc_index. If list, only show edges between these doc_index.
- edge_type: Only for a specific doc_index. It can be `cited`, `citing`, or `None`. If `None`, show both `cited` and `citing` edges. Defaults to `None`.
- show_timeline: Whether to show the timeline. In some cases the timeline may be disorderly, so you can set it to `False`. Defaults to `True`.
Returns:
Dot file content.
def generate_graph_node_info(self) -> pd.DataFrame:
    """Generate dataframe of graph node info. Columns differ according to `source_type`.

    Returns:
        Dataframe of graph node info.
    """
    if self.source_type == "wos":
        use_cols = ["doc_index", "AU", "TI", "PY", "SO", "LCS", "TC"]
    elif self.source_type == "cssci":
        use_cols = ["doc_index", "AU", "TI", "PY", "SO", "LCS"]
    elif self.source_type == "scopus":
        use_cols = ["doc_index", "AU", "TI", "PY", "SO", "LCS", "TC"]
    else:
        raise ValueError("invalid source type")
    # Only docs that appear in the most recently generated graph.
    graph_node_info = self.merged_docs_df.loc[self.node_list, use_cols]
    if "TC" in graph_node_info.columns:
        graph_node_info.rename(columns={"TC": "GCS"}, inplace=True)
    return graph_node_info
Generate dataframe of graph node info. Columns differ according to `source_type`.
Returns:
Dataframe of graph node info.
class ParseReference:
    """Parse raw reference strings from Web of Science, CSSCI and Scopus exports.

    Each `_parse_*_ref` method returns a dict of reference fields, or `None`
    when the reference cannot (or should not) be parsed.
    """

    @staticmethod
    def _parse_wos_ref(
        ref: str, doc_index: Optional[int] = None
    ) -> Optional[dict[str, Optional[str]]]:
        """Parse reference of Web of Science.

        Args:
            ref: A reference string. e.g. `CORTES C, 1995, MACH LEARN, V20, P273, DOI 10.1007/BF00994018`
            doc_index: doc_index to which the reference belongs. Default is None.

        Returns:
            A dict of reference fields, or None if the reference is skipped.
        """
        # Refs contain another language except english or AU is anonymous
        if re.search(r"[\[\]]", ref):
            return None

        # Don't parse patent
        if "Patent No." in ref:
            return None

        AU, PY, J9, VL, BP, DI = None, None, None, None, None, None

        if ", DOI " in ref:
            # Contain only one DOI
            if "DOI [" not in ref:
                DI_match = re.search(r"DOI (10.*)$", ref)
                DI = DI_match[1] if DI_match else None
            # Contain multi DOIs
            else:
                DI_match = re.search(r"DOI \[(.*)\]", ref)
                DI = DI_match[1] if DI_match else None
            ref = re.sub(r", DOI.*", "", ref)

        # Beginning page, e.g. ", P273" or ", pA12".
        BP_match = re.search(r", [Pp]([A-Za-z]?\d+)$", ref)
        if BP_match:
            BP = BP_match[1]
            ref = re.sub(r", [Pp][A-Za-z]?\d+", "", ref)

        ref = re.sub(r"[,\.] PROCEEDINGS(?=, )", "", ref, flags=re.I)
        # Volume, in one of three shapes: ", V20", ", VOL 20", or ", V20 SUPPL".
        if VL_match := re.search(r", V([\d-]+)$", ref):
            VL = VL_match[1]
            sub_pattern = r", V[\d-]+$"
        elif re.search(r", VOL[s\.]? ", ref, re.I):
            VL_match = re.search(r", VOL[s\.]? ([\w\- ]+)$", ref, re.I)
            VL = VL_match[1] if VL_match else None
            sub_pattern = r", V[Oo][Ll].*"
        elif VL_match := re.search(r"(?<=[A-Z\.]), V([\w\. ]+)$", ref):
            VL = VL_match[1]
            sub_pattern = r"(?<=[A-Z\.]), V[\w\. ]+$"
        else:
            sub_pattern = None

        if sub_pattern:
            ref = re.sub(sub_pattern, "", ref)

        # What remains should be "AU, PY, J9".
        dot_count = ref.count(", ")
        if dot_count == 2:
            AU, PY, J9 = ref.split(", ")
        elif dot_count > 2:
            # Extra commas in AU or J9; anchor the split on the 4-digit year.
            PY_pattern = r", (\d{4}), "
            if re.search(PY_pattern, ref):
                AU, PY, J9 = re.split(PY_pattern, ref, maxsplit=1)
            else:
                return None

        if DI:
            DI = DI.lower()
            # "DOI [a, b]" style: keep the DOI only if both entries agree.
            if len(re.findall(", ", DI)) == 1:
                try:
                    DI1, DI2 = DI.replace("doi ", "").split(", ")
                except ValueError:
                    return None
                if DI1 == DI2:
                    DI = DI1
                else:
                    DI = None

        if PY and not re.match(r"^\d{4}$", PY):
            PY = None

        result = asdict(WosField(AU, PY, J9, VL, BP, DI))
        if doc_index is not None:
            result["doc_index"] = doc_index
        return result

    @staticmethod
    def _parse_cssci_ref(
        ref: str, doc_index: Optional[int] = None
    ) -> Optional[dict[str, Optional[str]]]:
        """Parse reference of CSSCI. Only parse reference in Chinese language.

        Args:
            ref: A reference string. e.g. `1.严栋.基于物联网的智慧图书馆.图书馆学刊.2010.32(7)`
            doc_index: doc_index to which the reference belongs. Default is None.

        Returns:
            A dict of reference fields, or None if the reference is skipped.
        """
        # Field separator: a dot that is not a decimal point (with special
        # cases so "2010.32" year.volume still splits).
        dot_pattern = re.compile(
            r"(?<!\d)\.(?!\d)|(?<=\d)\.(?!\d)|(?<!\d)\.(?=\d)|(?<=\d{4})\.(?=\d)|(?<=\d)\.(?=\d{4})"
        )

        # Only Chinese-language references are parsed.  (Previously a
        # non-Chinese ref fell through past every branch.)
        if not re.search(r"[\u4e00-\u9fa5]", ref):
            return None

        dot_count = len(dot_pattern.findall(ref))

        if re.search(r"[^\d]\.{2,}", ref):
            return None

        # Dissertation
        elif ":学位论文." in ref:
            try:
                _, AU, TI, other = ref.split(".")
                SO, PY = other.split(",")
            except ValueError:
                # Malformed entry: skip it instead of crashing the whole batch.
                return None
            TI = TI.replace(":学位论文", "")
            PY = PY.split(":")[0]
            raw_result = CssciField(AU, TI, SO, PY, None)

        # Country standard
        elif "GB/T" in ref:
            if ref[-3:] == "出版社":
                _, AU, other = ref.split(".", 2)
                TI, SO = other.rsplit(".", 1)
                raw_result = CssciField(AU, TI, SO, None, None)
            else:
                _, AU, TI = ref.split(".", 2)
                raw_result = CssciField(AU, TI, None, None, None)

        # Standard
        elif re.search(r":DB\d{2}/T", ref):
            _, AU, other = ref.split(".", 2)
            TI, PY = other.rsplit(".", 1)
            raw_result = CssciField(AU, TI, None, PY, None)

        # Newspaper
        elif re.search(r"\.\d{1,2}\.\d{1,2}(?:\(|$)", ref):
            try:
                _, AU, TI, SO, other = re.split(dot_pattern, ref, maxsplit=4)
            except ValueError:
                return None
            raw_result = CssciField(AU, TI, SO, None, None)

        # Patent1
        elif re.search(r"\.CN\d{9}[A-Z]$", ref):
            TI = ref.split(".", 1)[1]
            raw_result = CssciField(None, TI, None, None, None)

        # Patent2
        elif re.search(r"^\d+\.一种", ref):
            date_pattern = re.compile(r"\d{4}\-\d{1,2}\-\d{1,2}")
            TI = ref.split(".", 1)[1]
            date = date_pattern.search(ref)
            if date:
                PY = date[0].split("-")[0]
            else:
                PY = None
            TI = date_pattern.sub("", TI).strip(".()")
            raw_result = CssciField(None, TI, None, PY, None)

        # Network resource
        elif re.search(r"\.\d{4}$", ref):
            if dot_count == 3:
                _, AU, TI, PY = re.split(dot_pattern, ref)
            elif dot_count == 4:
                # NOTE(review): SO is parsed but intentionally not stored here
                # (original behavior) — confirm network resources need no SO.
                _, AU, TI, SO, PY = re.split(dot_pattern, ref)
            else:
                return None
            raw_result = CssciField(AU, TI, None, PY, None)

        # Journal1
        elif dot_count == 5:
            _, AU, TI, SO, PY, VL = re.split(dot_pattern, ref)
            raw_result = CssciField(AU, TI, SO, PY, VL)
        # Journal2
        elif dot_count == 4:
            _, AU, TI, SO, _ = re.split(dot_pattern, ref)
            raw_result = CssciField(AU, TI, SO, None, None)

        # Book
        elif dot_count == 3:
            _, AU, TI, SO = re.split(dot_pattern, ref)
            raw_result = CssciField(AU, TI, SO, None, None)

        elif dot_count == 2:
            _, AU, TI = re.split(dot_pattern, ref)
            raw_result = CssciField(AU, TI, None, None, None)

        elif dot_count == 1:
            _, TI = re.split(dot_pattern, ref)
            raw_result = CssciField(None, TI, None, None, None)
        else:
            return None

        result = asdict(raw_result)
        if doc_index is not None:
            result["doc_index"] = doc_index
        return result

    @staticmethod
    def _parse_scopus_ref(
        ref: str, doc_index: Optional[int] = None
    ) -> Optional[dict[str, Optional[str]]]:
        """Parse reference of Scopus.

        Args:
            ref: A reference string. e.g. `Negri E, Fumagalli L, Macchi M., A Review of the Roles of Digital Twin in CPS-based Production Systems, Procedia Manufacturing, 11, pp. 939-948, (2017)`
            doc_index: doc_index to which the reference belongs. Default is None.

        Returns:
            A dict of reference fields, or None if the reference is skipped.
        """
        if re.search(r"^[^A-Z\*\']", ref):
            return None

        if re.search(r"[\[\]]", ref):
            return None

        if ref.count(", ") < 2:
            return None

        # Publication year: trailing ", (YYYY)" is mandatory.
        PY_match = re.search(r", \((\d{4})\)$", ref)
        if PY_match:
            PY = PY_match[1]
            ref = ref.rsplit(", ", 1)[0]
        else:
            return None

        First_AU, TI, SO, VL, IS, BP, EP = None, None, None, None, None, None, None

        # remove version info
        ref = re.sub(r", version [\d\.]+(?=,)", "", ref, flags=re.I)

        # remove doi info
        ref = re.sub(r", doi:.*(?=,|$)", "", ref, flags=re.I)

        # remove retrieval info
        ref = re.sub(r"[\.,] Retrieved.*(?=,)", "", ref, flags=re.I)
        ref = re.sub(r", Available from:(?=,)", "", ref, flags=re.I)

        # Page number
        if PP_match := re.search(r"(?:, | \()[Pp]{2}[\.,] ([\w\-]+)\)?", ref):
            PP = PP_match[1]
            try:
                BP, EP = re.split(r"-", PP, maxsplit=1)
            except ValueError:
                BP, EP = None, None
            ref = re.sub(r"(?:, | \()[Pp]{2}.*", "", ref)

        # Volume and Issue
        if VL_IS_match := re.search(r", (\d+\s?[A-Za-z]*, [\w\s\-\.\–]+)$", ref):
            VL, IS = VL_IS_match[1].split(", ")
            ref = ref.rsplit(", ", 2)[0]
        elif IS_match := re.search(r", ([\w-]* ?suppl\.? ?[\w-]*)$", ref, re.I):
            IS = IS_match[1]
            ref = ref.rsplit(", ", 1)[0]
        elif IS_match := re.search(r", (\d* ?PART\.? [A-Z\d]+)$", ref, re.I):
            IS = IS_match[1]
            ref = ref.rsplit(", ", 1)[0]
        elif IS_match := re.search(r", ([Nn]o\. \d+)$", ref):
            IS = IS_match[1]
            ref = ref.rsplit(", ", 1)[0]

        if VL_match := re.search(r", (\d+)$", ref):
            VL = VL_match[1]
            ref = ref.rsplit(", ", 1)[0]
        elif VL_match := re.search(r", ([Vv]ol\. [\w\s\.:]+)$", ref):
            VL = VL_match[1]
            ref = ref.rsplit(", ", 1)[0]

        # Author
        full_name_pattern = r"^(?:[a-zA-Z][a-zA-Z\-\.\']*\s)+[A-Z][a-zA-Z\-\.\']*(, |$)"
        if re.search(r"Et al\.", ref, flags=re.I):
            First_AU = ref.split(", ")[0]
            ref = re.sub(r"^.*Et al\.,?\s?", "", ref, flags=re.I)
        elif "., " in ref:
            AU = ref.rsplit("., ", 1)[0]
            if "," in AU:
                First_AU = AU.split(", ")[0]
            else:
                First_AU = AU + "."
            ref = ref.replace(f"{AU}., ", "")
        elif re.search(r"^(?:[A-Z][a-zA-Z]*\s)+[A-Z][a-zA-Z]*(?=, )", ref):
            First_AU = ref.split(", ", 1)[0]
            ref = ref.replace(f"{First_AU}, ", "")
        elif re.search(r"^[A-Z-]+, (?=[A-Z])", ref):
            First_AU = ref.split(", ", 1)[0]
            ref = ref.replace(f"{First_AU}, ", "")
        elif re.search(full_name_pattern, ref):
            First_AU = re.split(", ", ref, maxsplit=1)[0]
            while re.search(full_name_pattern, ref):
                ref = re.sub(full_name_pattern, "", ref, count=1)
        else:
            return None

        # Title and Source
        if ref != "":
            # A comma that is not inside brackets/parentheses.
            comma_pattern = r", (?![^\[]*\]|[^(]*\))"
            comma_count = len(re.findall(comma_pattern, ref))
            if comma_count == 0:
                TI = ref
            elif comma_count == 1:
                TI, SO = re.split(comma_pattern, ref)
            else:
                # conference ref
                if re.search(
                    r"[Cc]onference|Conf\.|[Pp]roceeding|Proc\.|[Cc]ommittee|[Ss]ymposium|[Cc]onvention|[Cc]ongress",
                    ref,
                ):
                    TI, SO = ref.split(", ", 1)
                # match source
                elif SO_match := re.search(r", ([A-Z\d][\w\s\.\-&:]+)$", ref):
                    SO = SO_match[1]
                    TI = ref.replace(f", {SO}", "")
                # match title
                elif TI_match := re.search(
                    r"^(([^\.\s]+ ){3,}[^\.\sA-Z]+), [A-Z]", ref
                ):
                    TI = TI_match[1]
                    SO = ref.replace(f"{TI}, ", "")
                elif re.search(r"^[A-Z][^A-Z]+$", ref):
                    TI = ref
                else:
                    return None

        result = asdict(ScopusField(First_AU, TI, SO, VL, IS, BP, EP, PY))
        if doc_index is not None:
            result["doc_index"] = doc_index
        return result

    def parse_one_ref(
        self, ref: str, source_type: Literal["wos", "cssci", "scopus"], doc_index=None
    ):
        """Parse a raw reference string.

        Args:
            ref: A raw reference string.
            source_type: Source type, `wos`, `cssci` or `scopus`.
            doc_index: doc_index to which the reference belongs. Default is `None`.

        Returns:
            Parsed reference as a dict.

        Raises:
            ValueError: If `source_type` is invalid.

        Example:
            >>> from histcite.parse_reference import ParseReference
            >>> ref = 'Bengio Y, 2001, ADV NEUR IN, V13, P932'
            >>> parsed_ref = ParseReference().parse_one_ref(ref, 'wos')
            >>> print(parsed_ref)
            {'First_AU': 'Bengio Y', 'PY': '2001', 'J9': 'ADV NEUR IN', 'VL': '13', 'BP': '932'}
        """
        if source_type == "wos":
            return self._parse_wos_ref(ref, doc_index)
        elif source_type == "cssci":
            return self._parse_cssci_ref(ref, doc_index)
        elif source_type == "scopus":
            return self._parse_scopus_ref(ref, doc_index)
        else:
            raise ValueError("Invalid source type")

    def parse_ref_cell(
        self,
        ref_cell: str,
        source_type: Literal["wos", "cssci", "scopus"],
        doc_index=None,
    ) -> Optional[list[dict[str, Optional[str]]]]:
        """Parse a reference cell.

        Args:
            ref_cell: A reference cell.
            source_type: Source type, `wos`, `cssci` or `scopus`.
            doc_index: doc_index to which the reference cell belongs. Default is `None`.

        Returns:
            List of parsed references, or None if the cell is not a string.

        Example:
            >>> from histcite.parse_reference import ParseReference
            >>> ref_cell = 'Bengio Y, 2001, ADV NEUR IN, V13, P932; CORTES C, 1995, MACH LEARN, V20, P273, DOI 10.1007/BF00994018'
            >>> parsed_ref_list = ParseReference().parse_ref_cell(ref_cell, 'wos')
            >>> print(parsed_ref_list)
            [{'First_AU': 'Bengio Y', 'PY': '2001', 'J9': 'ADV NEUR IN', 'VL': '13', 'BP': '932'}, {'First_AU': 'CORTES C', 'PY': '1995', 'J9': 'MACH LEARN', 'VL': '20', 'BP': '273'}]
        """
        sep = "; "
        try:
            ref_list = re.split(sep, ref_cell)
        except TypeError:
            # Missing cells (e.g. NaN) are not strings.
            return None

        parsed_ref_list = [
            self.parse_one_ref(ref, source_type, doc_index) for ref in ref_list
        ]
        return [ref for ref in parsed_ref_list if ref is not None]
def parse_one_ref(
    self, ref: str, source_type: Literal["wos", "cssci", "scopus"], doc_index=None
):
    """Parse a raw reference string.

    Args:
        ref: A raw reference string.
        source_type: Source type, `wos`, `cssci` or `scopus`.
        doc_index: doc_index to which the reference belongs. Default is `None`.

    Returns:
        Parsed reference as a dict.

    Raises:
        ValueError: If `source_type` is invalid.

    Example:
        >>> from histcite.parse_reference import ParseReference
        >>> ref = 'Bengio Y, 2001, ADV NEUR IN, V13, P932'
        >>> parsed_ref = ParseReference().parse_one_ref(ref, 'wos')
        >>> print(parsed_ref)
        {'First_AU': 'Bengio Y', 'PY': '2001', 'J9': 'ADV NEUR IN', 'VL': '13', 'BP': '932'}
    """
    # Dispatch table instead of an if/elif chain; unknown types raise.
    parsers = {
        "wos": self._parse_wos_ref,
        "cssci": self._parse_cssci_ref,
        "scopus": self._parse_scopus_ref,
    }
    if source_type not in parsers:
        raise ValueError("Invalid source type")
    return parsers[source_type](ref, doc_index)
Parse a raw reference string.
Arguments:
- ref: A raw reference string.
- source_type: Source type, `wos`, `cssci` or `scopus`.
- doc_index: doc_index to which the reference belongs. Default is `None`.
Returns:
Parsed reference as a dict.
Example:
>>> from histcite.parse_reference import ParseReference
>>> ref = 'Bengio Y, 2001, ADV NEUR IN, V13, P932'
>>> parsed_ref = ParseReference().parse_one_ref(ref, 'wos')
>>> print(parsed_ref)
{'First_AU': 'Bengio Y', 'PY': '2001', 'J9': 'ADV NEUR IN', 'VL': '13', 'BP': '932'}
def parse_ref_cell(
    self,
    ref_cell: str,
    source_type: Literal["wos", "cssci", "scopus"],
    doc_index=None,
) -> Optional[list[dict[str, Optional[str]]]]:
    """Parse a reference cell.

    Args:
        ref_cell: A reference cell.
        source_type: Source type, `wos`, `cssci` or `scopus`.
        doc_index: doc_index to which the reference cell belongs. Default is `None`.

    Returns:
        List of parsed references, or None if the cell is not a string.

    Example:
        >>> from histcite.parse_reference import ParseReference
        >>> ref_cell = 'Bengio Y, 2001, ADV NEUR IN, V13, P932; CORTES C, 1995, MACH LEARN, V20, P273, DOI 10.1007/BF00994018'
        >>> parsed_ref_list = ParseReference().parse_ref_cell(ref_cell, 'wos')
        >>> print(parsed_ref_list)
        [{'First_AU': 'Bengio Y', 'PY': '2001', 'J9': 'ADV NEUR IN', 'VL': '13', 'BP': '932'}, {'First_AU': 'CORTES C', 'PY': '1995', 'J9': 'MACH LEARN', 'VL': '20', 'BP': '273'}]
    """
    # A non-string cell (e.g. NaN) has no split(); treat it as unparseable.
    try:
        raw_refs = ref_cell.split("; ")
    except (AttributeError, TypeError):
        return None

    parsed = (self.parse_one_ref(one_ref, source_type, doc_index) for one_ref in raw_refs)
    return [item for item in parsed if item is not None]
Parse a reference cell.
Arguments:
- ref_cell: A reference cell.
- source_type: Source type, `wos`, `cssci` or `scopus`.
- doc_index: doc_index to which the reference cell belongs. Default is `None`.
Returns:
List of parsed references.
Example:
>>> from histcite.parse_reference import ParseReference
>>> ref_cell = 'Bengio Y, 2001, ADV NEUR IN, V13, P932; CORTES C, 1995, MACH LEARN, V20, P273, DOI 10.1007/BF00994018'
>>> parsed_ref_list = ParseReference().parse_ref_cell(ref_cell, 'wos')
>>> print(parsed_ref_list)
[{'First_AU': 'Bengio Y', 'PY': '2001', 'J9': 'ADV NEUR IN', 'VL': '13', 'BP': '932'}, {'First_AU': 'CORTES C', 'PY': '1995', 'J9': 'MACH LEARN', 'VL': '20', 'BP': '273'}]
class ProcessFile:
    """Process docs file, extract references and citation relationship.

    Attributes:
        docs_df: DataFrame of docs.
        source_type: Source type of docs, `wos`, `cssci` or `scopus`.
    """

    # The only source types the pipeline knows how to handle.
    _VALID_SOURCE_TYPES = ("wos", "cssci", "scopus")

    def __init__(
        self, docs_df: pd.DataFrame, source_type: Literal["wos", "cssci", "scopus"]
    ):
        """
        Args:
            docs_df: DataFrame of docs.
            source_type: Source type of docs, `wos`, `cssci` or `scopus`.
        """
        # Copy so in-place edits in process_citation() don't mutate the caller's frame.
        self.docs_df: pd.DataFrame = docs_df.copy()
        self.source_type: Literal["wos", "cssci", "scopus"] = source_type

    @staticmethod
    def _concat_refs(
        cr_field_series: pd.Series,
        source_type: Literal["wos", "cssci", "scopus"],
    ) -> pd.DataFrame:
        """Concat all parsed references and return dataframe.

        Args:
            cr_field_series: The CR field of docs_df.
            source_type: Source type of docs, `wos`, `cssci` or `scopus`.

        Returns:
            DataFrame of references.
        """
        total_ref_list: list[dict[str, Optional[str]]] = []
        # Reuse one parser instance instead of constructing one per row.
        parser = ParseReference()
        for idx, cell in cr_field_series.items():
            parse_result = parser.parse_ref_cell(cell, source_type, idx)
            if parse_result is not None:
                total_ref_list.extend(parse_result)
        return pd.DataFrame(total_ref_list)

    def extract_reference(self) -> pd.DataFrame:
        """Extract total references and return reference dataframe.

        Raises:
            ValueError: If `source_type` is not `wos`, `cssci` or `scopus`.
        """
        # Single validated call replaces the former three identical branches.
        if self.source_type not in self._VALID_SOURCE_TYPES:
            raise ValueError("Invalid source type")
        refs_df = self._concat_refs(self.docs_df["CR"], self.source_type)

        # Maybe duplicate reference in some docs' references
        refs_df.drop_duplicates(ignore_index=True, inplace=True)
        refs_df.insert(0, "ref_index", refs_df.index)
        return refs_df

    @staticmethod
    def _reference2citation(cited_doc_index_series: pd.Series) -> pd.Series:
        """Invert cited->citing: if doc i cites doc j, then j's list gains i."""
        citing_doc_index_series: pd.Series = pd.Series(
            [[] for _ in range(len(cited_doc_index_series))]
        )
        for doc_index, ref_list in cited_doc_index_series.items():
            for ref_index in ref_list:
                citing_doc_index_series[ref_index].append(doc_index)
        return citing_doc_index_series

    def process_citation(self, refs_df: pd.DataFrame) -> pd.DataFrame:
        """Return citation_relationship dataframe.

        Args:
            refs_df: DataFrame of references produced by `extract_reference`.

        Returns:
            DataFrame with cited/citing doc indexes plus LCR and LCS counts.

        Raises:
            ValueError: If `source_type` is not `wos`, `cssci` or `scopus`.
        """
        if self.source_type == "wos":
            # WoS matching involves DOI; normalize its case first.
            self.docs_df["DI"] = self.docs_df["DI"].str.lower()
            refs_df = refs_df.astype({"PY": "int64[pyarrow]"})
            cited_doc_index_series, _ = RecognizeReference.recognize_wos_reference(
                self.docs_df, refs_df
            )
        elif self.source_type == "cssci":
            # CSSCI/Scopus match on title, so compare case-insensitively.
            self.docs_df["TI"] = self.docs_df["TI"].str.lower()
            refs_df["TI"] = refs_df["TI"].str.lower()
            cited_doc_index_series, _ = RecognizeReference.recognize_cssci_reference(
                self.docs_df, refs_df
            )
        elif self.source_type == "scopus":
            self.docs_df["TI"] = self.docs_df["TI"].str.lower()
            refs_df["TI"] = refs_df["TI"].str.lower()
            cited_doc_index_series, _ = RecognizeReference.recognize_scopus_reference(
                self.docs_df, refs_df
            )
        else:
            raise ValueError("Invalid source type")

        # Align with the full doc set; docs without recognized refs get [].
        cited_doc_index_series = cited_doc_index_series.reindex(
            self.docs_df["doc_index"]
        )
        cited_doc_index_series = cited_doc_index_series.apply(
            lambda x: x if isinstance(x, list) else []
        )
        citing_doc_index_series = self._reference2citation(cited_doc_index_series)

        citation_relationship = pd.DataFrame({"doc_index": self.docs_df.doc_index})
        citation_relationship["cited_doc_index"] = [
            ";".join(str(j) for j in i) if i else None for i in cited_doc_index_series
        ]
        citation_relationship["citing_doc_index"] = [
            ";".join(str(j) for j in i) if i else None for i in citing_doc_index_series
        ]
        citation_relationship["LCR"] = cited_doc_index_series.apply(len)
        citation_relationship["LCS"] = citing_doc_index_series.apply(len)
        return citation_relationship
Process docs file, extract references and citation relationship.
Attributes:
- docs_df: DataFrame of docs.
- source_type: Source type of docs, `wos`, `cssci` or `scopus`.
def __init__(
    self, docs_df: pd.DataFrame, source_type: Literal["wos", "cssci", "scopus"]
):
    """
    Args:
        docs_df: DataFrame of docs.
        source_type: Source type of docs, `wos`, `cssci` or `scopus`.
    """
    self.source_type: Literal["wos", "cssci", "scopus"] = source_type
    # Keep a private copy so later in-place tweaks never touch the caller's frame.
    self.docs_df: pd.DataFrame = docs_df.copy()
Arguments:
- docs_df: DataFrame of docs.
- source_type: Source type of docs, `wos`, `cssci` or `scopus`.
def extract_reference(self) -> pd.DataFrame:
    """Extract total references and return reference dataframe."""
    valid_types = ("wos", "cssci", "scopus")
    if self.source_type not in valid_types:
        raise ValueError("Invalid source type")
    refs_df = self._concat_refs(self.docs_df["CR"], self.source_type)

    # A doc's reference list may itself contain duplicates.
    refs_df.drop_duplicates(ignore_index=True, inplace=True)
    refs_df.insert(0, "ref_index", refs_df.index)
    return refs_df
Extract total references and return reference dataframe.
def process_citation(self, refs_df: pd.DataFrame) -> pd.DataFrame:
    """Return citation_relationship dataframe."""
    if self.source_type == "wos":
        self.docs_df["DI"] = self.docs_df["DI"].str.lower()
        refs_df = refs_df.astype({"PY": "int64[pyarrow]"})
        cited_series, _local_refs = RecognizeReference.recognize_wos_reference(
            self.docs_df, refs_df
        )
    elif self.source_type == "cssci":
        self.docs_df["TI"] = self.docs_df["TI"].str.lower()
        refs_df["TI"] = refs_df["TI"].str.lower()
        cited_series, _local_refs = RecognizeReference.recognize_cssci_reference(
            self.docs_df, refs_df
        )
    elif self.source_type == "scopus":
        self.docs_df["TI"] = self.docs_df["TI"].str.lower()
        refs_df["TI"] = refs_df["TI"].str.lower()
        cited_series, _local_refs = RecognizeReference.recognize_scopus_reference(
            self.docs_df, refs_df
        )
    else:
        raise ValueError("Invalid source type")

    # Align to the full doc set; docs with no recognized refs get an empty list.
    cited_series = cited_series.reindex(self.docs_df["doc_index"])
    cited_series = cited_series.apply(lambda x: x if isinstance(x, list) else [])
    citing_series = self._reference2citation(cited_series)

    relationship = pd.DataFrame({"doc_index": self.docs_df.doc_index})
    relationship["cited_doc_index"] = [
        ";".join(str(j) for j in i) if i else None for i in cited_series
    ]
    relationship["citing_doc_index"] = [
        ";".join(str(j) for j in i) if i else None for i in citing_series
    ]
    relationship["LCR"] = cited_series.apply(len)
    relationship["LCS"] = citing_series.apply(len)
    return relationship
Return citation_relationship dataframe.
class ReadFile:
    """Read files in a folder and return a dataframe.

    Attributes:
        folder_path: Path of a folder.
        source_type: Type of source, `wos`, `cssci` or `scopus`.
        file_path_list: List of valid file paths. Files that don't comply
            with the naming rules are ignored.
    """

    # File-name prefix that identifies an export file of each source type.
    _FILE_PREFIXES = {"wos": "savedrecs", "cssci": "LY_", "scopus": "scopus"}

    def __init__(
        self, folder_path: str, source_type: Literal["wos", "cssci", "scopus"]
    ):
        """
        Args:
            folder_path: Path of a folder.
            source_type: Type of source, `wos`, `cssci` or `scopus`.
        """
        self.folder_path: str = folder_path
        self.source_type: Literal["wos", "cssci", "scopus"] = source_type
        self.file_path_list: list[str] = self._obtain_file_path_list()

    def _obtain_file_path_list(self) -> list[str]:
        """Return sorted paths of export files matching the source type's prefix.

        (Fixes the original `_obrain_file_path_list` typo; the method is
        private, so no external caller is affected.)

        Raises:
            ValueError: If `source_type` is not `wos`, `cssci` or `scopus`.
        """
        try:
            prefix = self._FILE_PREFIXES[self.source_type]
        except KeyError:
            raise ValueError("Invalid source type") from None
        file_name_list = sorted(
            name for name in os.listdir(self.folder_path) if name.startswith(prefix)
        )
        return [
            os.path.join(self.folder_path, file_name) for file_name in file_name_list
        ]

    def _concat_df(self, read_file_func: Callable[[str], pd.DataFrame]) -> pd.DataFrame:
        """Read every file in `file_path_list` with `read_file_func` and concat.

        Raises:
            FileNotFoundError: If the folder holds no valid file.
        """
        file_count = len(self.file_path_list)
        if file_count > 1:
            return pd.concat(
                [read_file_func(file_path) for file_path in self.file_path_list],
                ignore_index=True,
                copy=False,
            )
        elif file_count == 1:
            return read_file_func(self.file_path_list[0])
        else:
            raise FileNotFoundError("No valid file in the folder")

    def read_all(self) -> pd.DataFrame:
        """Concat multi dataframe and drop duplicate rows.

        if wos, drop duplicate rows by `UT`.

        if cssci, drop duplicate rows by `TI` and `First_AU`.

        if scopus, drop duplicate rows by `EID`.

        Raises:
            ValueError: If `source_type` is not `wos`, `cssci` or `scopus`.
        """
        # One dispatch picks both the reader and the de-duplication key
        # columns, replacing the original's two parallel if/elif chains.
        if self.source_type == "wos":
            reader, check_cols = ReadWosFile.read_wos_file, ["UT"]
        elif self.source_type == "cssci":
            reader, check_cols = ReadCssciFile.read_cssci_file, ["TI", "First_AU"]
        elif self.source_type == "scopus":
            reader, check_cols = ReadScopusFile.read_scopus_file, ["EID"]
        else:
            raise ValueError("Invalid source type")
        docs_df = self._concat_df(reader)

        # Drop duplicate rows
        original_num = docs_df.shape[0]
        docs_df.drop_duplicates(subset=check_cols, ignore_index=True, inplace=True)
        current_num = docs_df.shape[0]
        print(f"共读取 {original_num} 条数据,去重后剩余 {current_num} 条")
        docs_df.insert(0, "doc_index", docs_df.index)
        return docs_df
Read files in a folder and return a dataframe.
Attributes:
- folder_path: Path of a folder.
- source_type: Type of source, `wos`, `cssci` or `scopus`.
- file_path_list: List of valid file paths. Files that don't comply with the naming rules are ignored.
def __init__(
    self, folder_path: str, source_type: Literal["wos", "cssci", "scopus"]
):
    """
    Args:
        folder_path: Path of a folder.
        source_type: Type of source, `wos`, `cssci` or `scopus`.
    """
    self.source_type: Literal["wos", "cssci", "scopus"] = source_type
    self.folder_path: str = folder_path
    # NOTE(review): keeps the original method's "obrain" spelling (sic).
    self.file_path_list: list[str] = self._obrain_file_path_list()
Arguments:
- folder_path: Path of a folder.
- source_type: Type of source, `wos`, `cssci` or `scopus`.
def read_all(self) -> pd.DataFrame:
    """Concat multi dataframe and drop duplicate rows.

    if wos, drop duplicate rows by `UT`.

    if cssci, drop duplicate rows by `TI` and `First_AU`.

    if scopus, drop duplicate rows by `EID`.
    """
    # Pick the reader and the de-duplication key columns in one dispatch.
    if self.source_type == "wos":
        reader, dedup_cols = ReadWosFile.read_wos_file, ["UT"]
    elif self.source_type == "cssci":
        reader, dedup_cols = ReadCssciFile.read_cssci_file, ["TI", "First_AU"]
    elif self.source_type == "scopus":
        reader, dedup_cols = ReadScopusFile.read_scopus_file, ["EID"]
    else:
        raise ValueError("Invalid source type")
    docs_df = self._concat_df(reader)

    before = docs_df.shape[0]
    docs_df.drop_duplicates(subset=dedup_cols, ignore_index=True, inplace=True)
    after = docs_df.shape[0]
    print(f"共读取 {before} 条数据,去重后剩余 {after} 条")
    docs_df.insert(0, "doc_index", docs_df.index)
    return docs_df
Concat multi dataframe and drop duplicate rows.
If wos, drop duplicate rows by `UT`.
If cssci, drop duplicate rows by `TI` and `First_AU`.
If scopus, drop duplicate rows by `EID`.
class ReadWosFile:
    @staticmethod
    def _extract_first_author(au_field: pd.Series) -> pd.Series:
        """Return the first author of each AU cell, with the comma dropped."""
        first_author = au_field.str.split(pat=";", n=1, expand=True)[0]
        return first_author.str.replace(",", "")

    @staticmethod
    def read_wos_file(file_path: str) -> pd.DataFrame:
        """Read Web of Science file and return dataframe.

        Args:
            file_path: Path of a Web of Science file. File name is similar to `savedrecs.txt`.
        """
        wos_fields = [
            "AU", "TI", "SO", "DT", "CR",
            "DE", "C3", "NR", "TC", "J9",
            "PY", "VL", "BP", "DI", "UT",
        ]
        df = read_csv_file(file_path, wos_fields, "\t")
        df.insert(1, "First_AU", ReadWosFile._extract_first_author(df["AU"]))
        df["source file"] = os.path.basename(file_path)
        return df
@staticmethod
def read_wos_file(file_path: str) -> pd.DataFrame:
    """Read Web of Science file and return dataframe.

    Args:
        file_path: Path of a Web of Science file. File name is similar to `savedrecs.txt`.
    """
    wanted = [
        "AU", "TI", "SO", "DT", "CR",
        "DE", "C3", "NR", "TC", "J9",
        "PY", "VL", "BP", "DI", "UT",
    ]
    # WoS exports are tab-separated.
    df = read_csv_file(file_path, wanted, "\t")
    df.insert(1, "First_AU", ReadWosFile._extract_first_author(df["AU"]))
    df["source file"] = os.path.basename(file_path)
    return df
Read Web of Science file and return dataframe.
Arguments:
- file_path: Path of a Web of Science file. File name is similar to `savedrecs.txt`.
class ReadCssciFile:
    @staticmethod
    def _extract_org(org_cell: str) -> str:
        """Collect the distinct organization names from a CSSCI org cell."""
        unique_orgs = set(re.findall(r"](.*?)(?:/|$)", org_cell))
        return "; ".join(org.replace(".", "") for org in unique_orgs)

    @staticmethod
    def read_cssci_file(file_path: str) -> pd.DataFrame:
        """Read CSSCI file and return dataframe. Use `WOS` fields to replace original fields.

        Args:
            file_path: Path of a CSSCI file. File name is similar to `LY_.txt`.
        """
        # NOTE(review): relies on the platform default encoding — confirm the
        # export files are encoded accordingly.
        with open(file_path, "r") as fh:
            text = fh.read()

        # Records start after the first triple newline.
        body_text = text.split("\n\n\n", 1)[1]

        original_fields = [
            "来源篇名",
            "来源作者",
            "基 金",
            "期 刊",
            "机构名称",
            "第一作者",
            "年代卷期",
            "关 键 词",
            "参考文献",
        ]
        contents = {}
        for field in original_fields:
            if field == "参考文献":
                # The reference list is multi-line and ends at a dashed rule.
                field_pattern = "【参考文献】\n(.*?)\n?" + "-" * 5
                contents[field] = re.findall(field_pattern, body_text, flags=re.S)
            else:
                contents[field] = re.findall(f"【{field}】(.*?)\n", body_text)

        df = pd.DataFrame.from_dict(contents)
        # Map the Chinese field names onto WOS-style column names.
        df.rename(
            columns={
                "来源篇名": "TI",
                "来源作者": "AU",
                "基 金": "FU",
                "期 刊": "SO",
                "机构名称": "C3",
                "第一作者": "First_AU",
                "年代卷期": "PY&VL&BP&EP",
                "关 键 词": "DE",
                "参考文献": "CR",
            },
            inplace=True,
        )

        df["AU"] = df["AU"].str.replace("/", "; ")
        df["DE"] = df["DE"].str.replace("/", "; ")
        df["PY"] = df["PY&VL&BP&EP"].str.extract(r"^(\d{4}),", expand=False)
        df["C3"] = df["C3"].apply(ReadCssciFile._extract_org)
        df["CR"] = df["CR"].str.replace("\n", "; ")
        df["NR"] = df["CR"].str.count("; ")
        df.insert(2, "First_AU", df.pop("First_AU"))
        df["source file"] = os.path.basename(file_path)
        return df
@staticmethod
def read_cssci_file(file_path: str) -> pd.DataFrame:
    """Read CSSCI file and return dataframe. Use `WOS` fields to replace original fields.

    Args:
        file_path: Path of a CSSCI file. File name is similar to `LY_.txt`.
    """
    with open(file_path, "r") as source:
        raw_text = source.read()

    record_text = raw_text.split("\n\n\n", 1)[1]
    contents = {}
    cssci_fields = [
        "来源篇名",
        "来源作者",
        "基 金",
        "期 刊",
        "机构名称",
        "第一作者",
        "年代卷期",
        "关 键 词",
        "参考文献",
    ]
    for name in cssci_fields:
        if name != "参考文献":
            contents[name] = re.findall(f"【{name}】(.*?)\n", record_text)
        else:
            # Multi-line reference block, terminated by a dashed rule.
            ref_pattern = "【参考文献】\n(.*?)\n?" + "-" * 5
            contents[name] = re.findall(ref_pattern, record_text, flags=re.S)

    df = pd.DataFrame.from_dict(contents)
    # Map the Chinese field names onto WOS-style column names.
    column_mapping = {
        "来源篇名": "TI",
        "来源作者": "AU",
        "基 金": "FU",
        "期 刊": "SO",
        "机构名称": "C3",
        "第一作者": "First_AU",
        "年代卷期": "PY&VL&BP&EP",
        "关 键 词": "DE",
        "参考文献": "CR",
    }
    df.rename(columns=column_mapping, inplace=True)

    df["AU"] = df["AU"].str.replace("/", "; ")
    df["DE"] = df["DE"].str.replace("/", "; ")
    df["PY"] = df["PY&VL&BP&EP"].str.extract(r"^(\d{4}),", expand=False)
    df["C3"] = df["C3"].apply(ReadCssciFile._extract_org)
    df["CR"] = df["CR"].str.replace("\n", "; ")
    df["NR"] = df["CR"].str.count("; ")
    df.insert(2, "First_AU", df.pop("First_AU"))
    df["source file"] = os.path.basename(file_path)
    return df
Read CSSCI file and return dataframe. Use `WOS` fields to replace original fields.
Arguments:
- file_path: Path of a CSSCI file. File name is similar to `LY_.txt`.
class ReadScopusFile:
    @staticmethod
    def read_scopus_file(file_path: str) -> pd.DataFrame:
        """Read Scopus file and return dataframe. Use `WOS` fields to replace original fields.

        Args:
            file_path: Path of a Scopus file. File name is similar to `scopus.csv`.
        """
        scopus_cols = [
            "Authors",
            "Author full names",
            "Title",
            "Year",
            "Source title",
            "Volume",
            "Issue",
            "Page start",
            "Page end",
            "Cited by",
            "DOI",
            "Author Keywords",
            "References",
            "Document Type",
            "EID",
        ]
        df = read_csv_file(file_path, scopus_cols)

        # Map Scopus column names onto WOS-style names.
        renames = {
            "Authors": "AU",
            "Title": "TI",
            "Year": "PY",
            "Source title": "SO",
            "Volume": "VL",
            "Issue": "IS",
            "Page start": "BP",
            "Page end": "EP",
            "Cited by": "TC",
            "DOI": "DI",
            "Author Keywords": "DE",
            "References": "CR",
            "Document Type": "DT",
        }
        df.rename(columns=renames, inplace=True)

        # References are "; "-separated, so the separator count approximates NR.
        df["NR"] = df["CR"].str.count("; ")
        df.insert(1, "First_AU", df["AU"].str.split(pat=";", n=1, expand=True)[0])
        df["source file"] = os.path.basename(file_path)
        return df
132 @staticmethod 133 def read_scopus_file(file_path: str) -> pd.DataFrame: 134 """Read Scopus file return dataframe. Use `WOS` fields to replace original fields. 135 136 Args: 137 file_path: Path of a Scopus file. File name is similar to `scopus.csv`. 138 """ 139 use_cols = [ 140 "Authors", 141 "Author full names", 142 "Title", 143 "Year", 144 "Source title", 145 "Volume", 146 "Issue", 147 "Page start", 148 "Page end", 149 "Cited by", 150 "DOI", 151 "Author Keywords", 152 "References", 153 "Document Type", 154 "EID", 155 ] 156 157 df = read_csv_file(file_path, use_cols) 158 # Rename columns 159 column_mapping = { 160 "Authors": "AU", 161 "Title": "TI", 162 "Year": "PY", 163 "Source title": "SO", 164 "Volume": "VL", 165 "Issue": "IS", 166 "Page start": "BP", 167 "Page end": "EP", 168 "Cited by": "TC", 169 "DOI": "DI", 170 "Author Keywords": "DE", 171 "References": "CR", 172 "Document Type": "DT", 173 } 174 df.rename(columns=column_mapping, inplace=True) 175 176 df["NR"] = df["CR"].str.count("; ") 177 df.insert(1, "First_AU", df["AU"].str.split(pat=";", n=1, expand=True)[0]) 178 df["source file"] = os.path.basename(file_path) 179 return df
Read Scopus file and return dataframe. Use `WOS` fields to replace original fields.

Arguments:
- file_path: Path of a Scopus file. File name is similar to `scopus.csv`.
class RecognizeReference:
    """Recognize citation relationships between local docs and their references."""

    @staticmethod
    def recognize_refs_factory(
        docs_df: pd.DataFrame,
        refs_df: pd.DataFrame,
        compare_cols: list[str],
        drop_duplicates: bool = False,
    ):
        """Recognize local references of a doc.

        Args:
            docs_df: DataFrame of docs.
            refs_df: DataFrame of references.
            compare_cols: Columns to compare. e.g. `["First_AU", "TI"]`.
            drop_duplicates: Whether to drop duplicated rows with same values
                in `compare_cols`. Default is False.

        Returns:
            Tuple of two Series, cited_refs_series and local_refs_series.

            cited_refs_series: A Series of lists, each list contains the
                indexes of local references, indexed by citing doc_index.
            local_refs_series: A Series of ref_index values that matched a
                local doc.
        """
        # Rows missing any compare column can never match; drop them up front.
        docs_df = docs_df.dropna(subset=compare_cols)
        refs_df = refs_df.dropna(subset=compare_cols)

        if drop_duplicates is True:
            docs_df = docs_df.drop_duplicates(subset=compare_cols)

        docs_df = docs_df[["doc_index"] + compare_cols]
        refs_df = refs_df[["doc_index", "ref_index"] + compare_cols]
        # Left-join refs onto docs on the compare columns; refs without a
        # matching local doc end up with NaN in doc_index_y and are dropped.
        shared_df = pd.merge(
            refs_df, docs_df, how="left", on=compare_cols, suffixes=("_x", "_y")
        ).dropna(subset="doc_index_y")
        shared_df = shared_df.astype({"doc_index_y": "int64"})
        cited_refs_series = shared_df.groupby("doc_index_x")["doc_index_y"].apply(list)
        # Invariant: each citing doc's cited indexes are in ascending order.
        cited_refs_series = cited_refs_series.apply(lambda x: sorted(x))
        local_refs_series = shared_df["ref_index"].reset_index(drop=True)
        return cited_refs_series, local_refs_series

    @staticmethod
    def recognize_wos_reference(docs_df: pd.DataFrame, refs_df: pd.DataFrame):
        """Recognize local references of a doc from Web of Science.

        If `DOI` exists, use `DOI` to recognize references.
        Otherwise, use `First_AU`, `PY`, `J9`, `BP` to recognize references.

        Args:
            docs_df: DataFrame of docs.
            refs_df: DataFrame of references.

        Returns:
            Tuple of two Series, cited_refs_series and local_refs_series.
        """

        def _merge_lists(list1: Optional[list[int]], list2: Optional[list[int]]):
            # `Series.combine` fills unmatched index positions with NaN, so
            # either argument may be a non-list; fall back to whichever side
            # actually is a list.
            if isinstance(list1, list) and isinstance(list2, list):
                # A citing doc can have both DOI-matched and field-matched
                # refs; re-sort so the merged list keeps the sorted-order
                # invariant established by `recognize_refs_factory`.
                return sorted(list1 + list2)
            else:
                if isinstance(list1, list):
                    return list1
                else:
                    return list2

        # Pass 1: match on DOI where both sides have one.
        compare_cols_doi = ["DI"]
        result_doi = RecognizeReference.recognize_refs_factory(
            docs_df, refs_df, compare_cols_doi
        )

        # Pass 2: for rows lacking a DOI, match on author/year/journal/page.
        compare_cols = ["First_AU", "PY", "J9", "BP"]
        result = RecognizeReference.recognize_refs_factory(
            docs_df[docs_df["DI"].isna()], refs_df[refs_df["DI"].isna()], compare_cols
        )
        cited_refs_series = result_doi[0].combine(result[0], _merge_lists)
        local_refs_series = pd.concat([result_doi[1], result[1]])
        return cited_refs_series, local_refs_series

    @staticmethod
    def recognize_cssci_reference(docs_df: pd.DataFrame, refs_df: pd.DataFrame):
        """Recognize local references of a doc from CSSCI.

        Use `First_AU`, `TI` to recognize references.

        Args:
            docs_df: DataFrame of docs.
            refs_df: DataFrame of references.

        Returns:
            Tuple of two Series, cited_refs_series and local_refs_series.
        """
        compare_cols = ["First_AU", "TI"]
        return RecognizeReference.recognize_refs_factory(docs_df, refs_df, compare_cols)

    @staticmethod
    def recognize_scopus_reference(docs_df: pd.DataFrame, refs_df: pd.DataFrame):
        """Recognize local references of a doc from Scopus.

        Use `First_AU`, `TI` to recognize references.

        Args:
            docs_df: DataFrame of docs.
            refs_df: DataFrame of references.

        Returns:
            Tuple of two Series, cited_refs_series and local_refs_series.
        """
        # Scopus exports can contain duplicate doc rows; keep only the first.
        compare_cols = ["First_AU", "TI"]
        return RecognizeReference.recognize_refs_factory(
            docs_df, refs_df, compare_cols, drop_duplicates=True
        )
8 @staticmethod 9 def recognize_refs_factory( 10 docs_df: pd.DataFrame, 11 refs_df: pd.DataFrame, 12 compare_cols: list[str], 13 drop_duplicates: bool = False, 14 ): 15 """ 16 Recognize local references of a doc. 17 18 Args: 19 docs_df: DataFrame of docs. 20 refs_df: DataFrame of references. 21 compare_cols: Columns to compare. e.g. `["First_AU", "TI"]`. 22 drop_duplicates: Whether to drop duplicated rows with same values in `compare_cols`. Default is False. 23 24 Returns: 25 Tuple of two Series, cited_refs_series and local_refs_series. 26 27 cited_refs_series: A Series of lists, each list contains the indexes of local references. 28 local_refs_series: A Series of indexes of local references. 29 """ 30 # Drop rows with missing values 31 docs_df = docs_df.dropna(subset=compare_cols) 32 refs_df = refs_df.dropna(subset=compare_cols) 33 34 if drop_duplicates is True: 35 docs_df = docs_df.drop_duplicates(subset=compare_cols) 36 37 docs_df = docs_df[["doc_index"] + compare_cols] 38 refs_df = refs_df[["doc_index", "ref_index"] + compare_cols] 39 shared_df = pd.merge( 40 refs_df, docs_df, how="left", on=compare_cols, suffixes=("_x", "_y") 41 ).dropna(subset="doc_index_y") 42 shared_df = shared_df.astype({"doc_index_y": "int64"}) 43 cited_refs_series = shared_df.groupby("doc_index_x")["doc_index_y"].apply(list) 44 cited_refs_series = cited_refs_series.apply(lambda x: sorted(x)) 45 local_refs_series = shared_df["ref_index"].reset_index(drop=True) 46 return cited_refs_series, local_refs_series
Recognize local references of a doc.

Arguments:
- docs_df: DataFrame of docs.
- refs_df: DataFrame of references.
- compare_cols: Columns to compare, e.g. `["First_AU", "TI"]`.
- drop_duplicates: Whether to drop duplicated rows with the same values in `compare_cols`. Default is False.

Returns:
Tuple of two Series, cited_refs_series and local_refs_series.

- cited_refs_series: A Series of lists; each list contains the indexes of local references.
- local_refs_series: A Series of indexes of local references.
48 @staticmethod 49 def recognize_wos_reference(docs_df: pd.DataFrame, refs_df: pd.DataFrame): 50 """Recognize local references of a doc from Web of Science. 51 52 If `DOI` exists, use `DOI` to recognize references. 53 Otherwise, use `First_AU`, `PY`, `J9`, `BP` to recognize references. 54 55 Args: 56 docs_df: DataFrame of docs. 57 refs_df: DataFrame of references. 58 59 Returns: 60 Tuple of two Series, cited_refs_series and local_refs_series. 61 """ 62 63 def _merge_lists(list1: Optional[list[int]], list2: Optional[list[int]]): 64 if isinstance(list1, list) and isinstance(list2, list): 65 return list1 + list2 66 else: 67 if isinstance(list1, list): 68 return list1 69 else: 70 return list2 71 72 # DOI exists 73 compare_cols_doi = ["DI"] 74 result_doi = RecognizeReference.recognize_refs_factory( 75 docs_df, refs_df, compare_cols_doi 76 ) 77 78 # DOI not exists 79 compare_cols = ["First_AU", "PY", "J9", "BP"] 80 result = RecognizeReference.recognize_refs_factory( 81 docs_df[docs_df["DI"].isna()], refs_df[refs_df["DI"].isna()], compare_cols 82 ) 83 cited_refs_series = result_doi[0].combine(result[0], _merge_lists) 84 local_refs_series = pd.concat([result_doi[1], result[1]]) 85 return cited_refs_series, local_refs_series
Recognize local references of a doc from Web of Science.

If `DOI` exists, use `DOI` to recognize references. Otherwise, use `First_AU`, `PY`, `J9`, `BP` to recognize references.

Arguments:
- docs_df: DataFrame of docs.
- refs_df: DataFrame of references.

Returns:
Tuple of two Series, cited_refs_series and local_refs_series.
87 @staticmethod 88 def recognize_cssci_reference(docs_df: pd.DataFrame, refs_df: pd.DataFrame): 89 """Recognize local references of a doc from CSSCI. 90 91 Use `First_AU`, `TI` to recognize references. 92 93 Args: 94 docs_df: DataFrame of docs. 95 refs_df: DataFrame of references. 96 97 Returns: 98 Tuple of two Series, cited_refs_series and local_refs_series. 99 """ 100 compare_cols = ["First_AU", "TI"] 101 return RecognizeReference.recognize_refs_factory(docs_df, refs_df, compare_cols)
Recognize local references of a doc from CSSCI.

Use `First_AU`, `TI` to recognize references.

Arguments:
- docs_df: DataFrame of docs.
- refs_df: DataFrame of references.

Returns:
Tuple of two Series, cited_refs_series and local_refs_series.
103 @staticmethod 104 def recognize_scopus_reference(docs_df: pd.DataFrame, refs_df: pd.DataFrame): 105 """Recognize local references of a doc from Scopus. 106 107 Use `First_AU`, `TI` to recognize references. 108 109 Args: 110 docs_df: DataFrame of docs. 111 refs_df: DataFrame of references. 112 113 Returns: 114 Tuple of two Series, cited_refs_series and local_refs_series. 115 """ 116 compare_cols = ["First_AU", "TI"] 117 return RecognizeReference.recognize_refs_factory( 118 docs_df, refs_df, compare_cols, drop_duplicates=True 119 )
Recognize local references of a doc from Scopus.

Use `First_AU`, `TI` to recognize references.

Arguments:
- docs_df: DataFrame of docs.
- refs_df: DataFrame of references.

Returns:
Tuple of two Series, cited_refs_series and local_refs_series.