Skip to content

Commit d74b01a

Browse files
committed
Merge branch 'dev' into jumpstart
Signed-off-by: Ulincsys <[email protected]>
2 parents 99a1bb0 + b778fcd commit d74b01a

File tree

40 files changed

+14128
-2388
lines changed

40 files changed

+14128
-2388
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Augur NEW Release v0.76.0
1+
# Augur NEW Release v0.76.1
22

33
Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else!
44
The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot). A public instance of 8Knot is available at https://metrix.chaoss.io, which is tied to a public instance of Augur at https://ai.chaoss.io
@@ -10,7 +10,7 @@ The primary way of looking at Augur data is through [8Knot](https://github.com/o
1010
## NEW RELEASE ALERT!
1111
### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md)
1212

13-
Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.0
13+
Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.1
1414

1515
- The `main` branch is a stable version of our new architecture, which features:
1616
- Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks.

augur/api/routes/pull_request_reports.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,12 @@
2121
from bokeh.models.glyphs import Rect
2222
from bokeh.transform import dodge, factor_cmap, transform
2323

24+
# from selenium.webdriver import Firefox, FirefoxOptions
25+
# options = FirefoxOptions()
26+
# options.headless = True
27+
# webdriver = Firefox(options=options)
28+
#export_png(item, path, webdriver=webdriver)
29+
2430
warnings.filterwarnings('ignore')
2531

2632
from augur.api.routes import AUGUR_API_VERSION
@@ -604,6 +610,7 @@ def average_commits_per_PR():
604610
# opts = FirefoxOptions()
605611
# opts.add_argument("--headless")
606612
# driver = webdriver.Firefox(firefox_options=opts)
613+
# filename = export_png(grid, timeout=180, webdriver=webdriver)
607614
filename = export_png(grid, timeout=180)
608615

609616
return send_file(filename)

augur/application/cli/backend.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ def determine_worker_processes(ratio,maximum):
194194
sleep_time += 6
195195

196196
# 40% of estimate, capped at a maximum of 90 core worker processes
197-
core_num_processes = determine_worker_processes(.40, 50)
197+
core_num_processes = determine_worker_processes(.40, 90)
198198
logger.info(f"Starting core worker processes with concurrency={core_num_processes}")
199199
core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={core_num_processes} -n core:{uuid.uuid4().hex}@%h"
200200
process_list.append(subprocess.Popen(core_worker.split(" ")))

augur/application/cli/collection.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ def determine_worker_processes(ratio,maximum):
126126
sleep_time += 6
127127

128128
# 40% of estimate, capped at a maximum of 90 core worker processes
129-
core_num_processes = determine_worker_processes(.40, 50)
129+
core_num_processes = determine_worker_processes(.40, 90)
130130
logger.info(f"Starting core worker processes with concurrency={core_num_processes}")
131131
core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={core_num_processes} -n core:{uuid.uuid4().hex}@%h"
132132
process_list.append(subprocess.Popen(core_worker.split(" ")))

augur/application/cli/tasks.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ def start():
3636
secondary_worker_process = None
3737

3838
scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=1 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling"
39-
core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=50 -n core:{uuid.uuid4().hex}@%h"
40-
secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=50 -n secondary:{uuid.uuid4().hex}@%h -Q secondary"
39+
core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=90 -n core:{uuid.uuid4().hex}@%h"
40+
secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=20 -n secondary:{uuid.uuid4().hex}@%h -Q secondary"
4141

4242
scheduling_worker_process = subprocess.Popen(scheduling_worker.split(" "))
4343
core_worker_process = subprocess.Popen(core_worker.split(" "))

augur/application/db/models/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363
PullRequestTeam,
6464
PullRequestRepo,
6565
PullRequestReviewMessageRef,
66+
CommitMessage,
6667
RepoClone,
6768
)
6869

augur/application/db/models/augur_data.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1350,6 +1350,36 @@ class Commit(Base):
13501350
repo = relationship("Repo", back_populates="commits")
13511351
message_ref = relationship("CommitCommentRef", back_populates="cmt")
13521352

1353+
class CommitMessage(Base):
1354+
__tablename__ = "commit_messages"
1355+
__table_args__ = ( UniqueConstraint("repo_id","cmt_hash", name="commit-message-insert-unique"),
1356+
{
1357+
"schema": "augur_data",
1358+
"comment": "This table holds commit messages",
1359+
}
1360+
)
1361+
1362+
cmt_msg_id = Column(
1363+
BigInteger,
1364+
primary_key=True,
1365+
server_default=text("nextval('augur_data.commits_cmt_id_seq'::regclass)"),
1366+
)
1367+
1368+
repo_id = Column(
1369+
ForeignKey("augur_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE"),
1370+
nullable=False,
1371+
)
1372+
1373+
cmt_msg = Column(String, nullable=False)
1374+
1375+
cmt_hash = Column(String(80), nullable=False)
1376+
1377+
tool_source = Column(String)
1378+
tool_version = Column(String)
1379+
data_source = Column(String)
1380+
data_collection_date = Column(
1381+
TIMESTAMP(precision=0), server_default=text("CURRENT_TIMESTAMP")
1382+
)
13531383

13541384
class Issue(Base):
13551385
__tablename__ = "issues"

augur/application/db/models/augur_operations.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -30,39 +30,38 @@ def retrieve_owner_repos(session, owner: str) -> List[str]:
3030
Returns
3131
List of valid repo urls or empty list if invalid org
3232
"""
33-
from augur.tasks.github.util.github_paginator import GithubPaginator, retrieve_dict_from_endpoint
33+
from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException
3434

3535
OWNER_INFO_ENDPOINT = f"https://api.github.com/users/{owner}"
3636
ORG_REPOS_ENDPOINT = f"https://api.github.com/orgs/{owner}/repos?per_page=100"
3737
USER_REPOS_ENDPOINT = f"https://api.github.com/users/{owner}/repos?per_page=100"
3838

39+
github_data_access = GithubDataAccess(session.oauths, logger)
40+
3941
if not session.oauths.list_of_keys:
4042
return None, {"status": "No valid github api keys to retrieve data with"}
4143

4244
# determine whether the owner is a user or an organization
43-
data, _ = retrieve_dict_from_endpoint(logger, session.oauths, OWNER_INFO_ENDPOINT)
44-
if not data:
45+
try:
46+
data = github_data_access.get_resource(OWNER_INFO_ENDPOINT)
47+
except UrlNotFoundException as e:
48+
logger.error("Owner not found on github")
4549
return None, {"status": "Invalid owner"}
50+
except Exception as e:
51+
logger.error(f"Failed to get owner data from github. Exception: {e}")
52+
return None, {"status": "Failed to get owner data from github"}
4653

4754
owner_type = data["type"]
4855

49-
5056
if owner_type == "User":
5157
url = USER_REPOS_ENDPOINT
5258
elif owner_type == "Organization":
5359
url = ORG_REPOS_ENDPOINT
5460
else:
5561
return None, {"status": f"Invalid owner type: {owner_type}"}
5662

57-
5863
# collect repo urls for the given owner
59-
repos = []
60-
for page_data, _ in GithubPaginator(url, session.oauths, logger).iter_pages():
61-
62-
if page_data is None:
63-
break
64-
65-
repos.extend(page_data)
64+
repos = list(github_data_access.paginate_resource(url))
6665

6766
repo_urls = [repo["html_url"] for repo in repos]
6867

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
"""Add commit message table
2+
3+
Revision ID: 29
4+
Revises: 28
5+
Create Date: 2024-07-25 12:02:57.185867
6+
7+
"""
8+
from alembic import op
9+
import sqlalchemy as sa
10+
from sqlalchemy.dialects import postgresql
11+
12+
# revision identifiers, used by Alembic.
13+
revision = '29'
14+
down_revision = '28'
15+
branch_labels = None
16+
depends_on = None
17+
18+
19+
def upgrade():
20+
# ### commands auto generated by Alembic - please adjust! ###
21+
op.create_table('commit_messages',
22+
sa.Column('cmt_msg_id', sa.BigInteger(), server_default=sa.text("nextval('augur_data.commits_cmt_id_seq'::regclass)"), nullable=False),
23+
sa.Column('repo_id', sa.BigInteger(), nullable=False),
24+
sa.Column('cmt_msg', sa.String(), nullable=False),
25+
sa.Column('cmt_hash', sa.String(length=80), nullable=False),
26+
sa.Column('tool_source', sa.String(), nullable=True),
27+
sa.Column('tool_version', sa.String(), nullable=True),
28+
sa.Column('data_source', sa.String(), nullable=True),
29+
sa.Column('data_collection_date', postgresql.TIMESTAMP(precision=0), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=True),
30+
sa.ForeignKeyConstraint(['repo_id'], ['augur_data.repo.repo_id'], onupdate='CASCADE', ondelete='RESTRICT'),
31+
sa.PrimaryKeyConstraint('cmt_msg_id'),
32+
sa.UniqueConstraint('repo_id', 'cmt_hash', name='commit-message-insert-unique'),
33+
schema='augur_data',
34+
comment='This table holds commit messages'
35+
)
36+
# ### end Alembic commands ###
37+
38+
39+
def downgrade():
40+
# ### commands auto generated by Alembic - please adjust! ###
41+
op.drop_table('commit_messages', schema='augur_data')
42+
# ### end Alembic commands ###

augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from datetime import datetime
55

66
from augur.tasks.init.celery_app import celery_app as celery
7-
from augur.tasks.github.util.github_paginator import GithubPaginator
7+
from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException
88
from augur.application.db.models import ContributorRepo
99
from augur.application.db.lib import bulk_insert_dicts
1010
from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth
@@ -83,6 +83,8 @@ def contributor_breadth_model(self) -> None:
8383

8484
cntrb_newest_events_map[gh_login] = newest_event_date
8585

86+
github_data_access = GithubDataAccess(key_auth, logger)
87+
8688
index = 1
8789
total = len(current_cntrb_logins)
8890
for cntrb in current_cntrb_logins:
@@ -98,18 +100,22 @@ def contributor_breadth_model(self) -> None:
98100

99101

100102
cntrb_events = []
101-
for page_data, page in GithubPaginator(repo_cntrb_url, key_auth, logger).iter_pages():
103+
try:
104+
for event in github_data_access.paginate_resource(repo_cntrb_url):
102105

103-
if page_data:
104-
cntrb_events += page_data
106+
cntrb_events.append(event)
105107

106-
oldest_event_on_page = datetime.strptime(page_data[-1]["created_at"], "%Y-%m-%dT%H:%M:%SZ")
107-
if oldest_event_on_page < newest_event_in_db:
108-
print("Found cntrb events we already have...skipping the rest")
108+
event_age = datetime.strptime(event["created_at"], "%Y-%m-%dT%H:%M:%SZ")
109+
if event_age < newest_event_in_db:
110+
logger.info("Found cntrb events we already have...skipping the rest")
109111
break
110112

111-
if len(cntrb_events) == 0:
112-
logger.info("There are no cntrb events, or new events for this user.\n")
113+
if len(cntrb_events) == 0:
114+
logger.info("There are no cntrb events, or new events for this user.\n")
115+
continue
116+
117+
except UrlNotFoundException as e:
118+
logger.warning(e)
113119
continue
114120

115121
events = process_contributor_events(cntrb, cntrb_events, logger, tool_source, tool_version, data_source)

0 commit comments

Comments
 (0)